In [1]:
# Colab-only setup: attach Google Drive to the runtime at /content/drive.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

# Switch into the project folder so that the relative data/... and output/...
# paths configured later in the notebook resolve against it.
%cd "/content/drive/My Drive/CitNet"
# List the project folder as a quick check that the mount and cd worked.
%ls

Mounted at /content/drive
/content/drive/My Drive/CitNet
 confusion_matrix.ipynb   data_summary.ipynb   [0m[01;34moutput[0m/
 [01;34mdata[0m/                   'Deep GCN.ipynb'      PCA.ipynb
'Data Summary.gdoc'       [01;34mmethod[0m/              train_test_split.ipynb


In [0]:
import os
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [0]:
# Dataset selector: 'small' picks the files under data/small; any other value
# falls through to the files under data/raw.
DATA_TYPE = 'small'

# Per-dataset settings:
#   FEATURE_PATH - CSV of (node_id, feature_id, feature_val) rows
#   LABEL_PATH   - CSV of (node_id, class_id, is_train) rows
#   OUTPUT_PATH  - where this notebook's predictions are meant to be written
#   NODE_NUM     - number of rows allocated for the feature / label arrays
#   FEATURE_NUM  - number of columns allocated for the feature array
if DATA_TYPE == 'small':
    FEATURE_PATH = "data/small/feature.csv"
    LABEL_PATH = "data/small/target_with_mask_80.csv"
    OUTPUT_PATH = "output/small/NB.csv"
    NODE_NUM = 2708
    FEATURE_NUM = 1432
else:
    FEATURE_PATH = "data/raw/feature.csv"
    LABEL_PATH = "data/raw/target_with_mask_80.csv"
    # Keep raw results separate from the small-dataset results; pointing this
    # at output/small would overwrite them. The output/raw directory must exist
    # before anything is written here.
    OUTPUT_PATH = "output/raw/NB.csv"
    NODE_NUM = 11881
    FEATURE_NUM = 9568

In [0]:
def read_features():
    """Build the dense node-by-feature matrix from the file at FEATURE_PATH.

    Every line after the first is expected to hold three comma-separated
    fields: node index, feature index, feature value. Cells that never
    appear in the file stay at 0.

    Returns:
        np.ndarray of shape (NODE_NUM, FEATURE_NUM) and dtype float32.
    """
    matrix = np.zeros((NODE_NUM, FEATURE_NUM), dtype=np.float32)
    with open(FEATURE_PATH, "r") as handle:
        # The first line is skipped (assumed to be the CSV header).
        next(handle)
        for record in handle:
            row, col, value = record.strip().split(",")
            matrix[int(row), int(col)] = float(value)
    return matrix
    
def read_labels():
    """Load each node's class id and train/test flag from LABEL_PATH.

    Every line after the first is expected to hold three comma-separated
    fields: node index, class id, train flag. For datasets other than
    "small" the class field may contain several tab-separated entries; only
    the first one is kept.

    Returns:
        np.ndarray of shape (NODE_NUM, 2) and dtype int64. Column 0 is the
        class id, column 1 is the train flag. Nodes that never appear in the
        file keep the all-zero default row.

    Raises:
        ValueError: if a line does not have exactly three fields or a field
            is not a valid integer.
    """
    labels = np.zeros((NODE_NUM, 2), dtype=np.int64)
    with open(LABEL_PATH, "r") as f:
        # The first line is skipped (assumed to be the CSV header).
        next(f)
        for line in f:
            node_id, class_id, is_train = line.strip().split(",")

            if DATA_TYPE != "small":
                # Keep only the first tab-separated class entry.
                # NOTE(review): the original author flagged that a random
                # pick among the entries may be preferable here.
                class_id = class_id.split("\t")[0]

            row = int(node_id)
            labels[row, 0] = int(class_id)
            # Convert explicitly instead of relying on NumPy's implicit
            # string-to-integer coercion on assignment.
            labels[row, 1] = int(is_train)
    return labels

def split_dataset(features=None, labels=None):
    """Split node features and class ids into train and test sets.

    Args:
        features: Optional 2-D array with one row per node. Loaded with
            read_features() when omitted.
        labels: Optional 2-D integer array with one row per node, where
            column 0 is the class id and column 1 is the train flag
            (1 = train, 0 = test). Loaded with read_labels() when omitted.

    Returns:
        Tuple (X_train, Y_train, X_test, Y_test). The Y arrays are always
        1-D, even when a split contains exactly one node. Rows whose flag is
        neither 0 nor 1 end up in neither split.
    """
    if features is None:
        features = read_features()
    if labels is None:
        labels = read_labels()

    # Boolean masks keep the selected labels 1-D. Indexing with the tuple
    # returned by np.where and then squeezing collapses a single-sample split
    # to a 0-d array.
    train_mask = labels[:, 1] == 1
    test_mask = labels[:, 1] == 0

    X_train = features[train_mask]
    Y_train = labels[train_mask, 0]
    X_test = features[test_mask]
    Y_test = labels[test_mask, 0]

    # Disabled preprocessing steps, kept for reference:
    # scaler = StandardScaler()
    # scaler.fit(X_train)
    # X_train = scaler.transform(X_train)
    # X_test = scaler.transform(X_test)

    # pca = PCA(n_components=0.99)
    # pca.fit(X_train)
    # X_train = pca.transform(X_train)
    # # print(X_train.shape)

    # X_test = pca.transform(X_test)
    # # print(X_test.shape)

    return X_train, Y_train, X_test, Y_test

In [295]:
# Materialise the train / test partitions used by the cells below.
X_train, Y_train, X_test, Y_test = split_dataset()

# Report the array shapes of each partition, one line per split.
print(
    "Train: X=> {}, Y => {}".format(X_train.shape, Y_train.shape),
    "Test : X=> {}, Y => {}".format(X_test.shape, Y_test.shape),
    sep="\n",
)

Train: X=> (2166, 1432), Y => (2166,)
Test : X=> (542, 1432), Y => (542,)


In [296]:
# Choose the Naive Bayes variant by dataset: Bernoulli for "small",
# multinomial for anything else.
model = BernoulliNB() if DATA_TYPE == "small" else MultinomialNB()

# fit() returns the estimator itself, so training and scoring can be chained;
# score() is evaluated on the held-out test split.
accuracy = model.fit(X_train, Y_train).score(X_test, Y_test)

print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 76.20%


In [0]:
# import pandas as pd

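# NOTE(review): `tests` and `preds` are not defined anywhere in this notebook;
# presumably they stand for Y_test and model.predict(X_test) - confirm before
# re-enabling this export.
# res = list(zip(tests, preds))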

# df = pd.DataFrame(res)
# df.to_csv(OUTPUT_PATH, header=["gt", "pred"], index=False)