In [20]:
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from TextDataset import TextDataset
from gnn_model import gcn
import torch.nn.functional as F
import numpy as np
from scipy import sparse
from collections import OrderedDict
from itertools import combinations

In [21]:
# create docList
# Each line is tab-separated; only the second field (index 1) is kept.
# The `with` block closes the file on exit, so no explicit close() is needed.
# UTF-8 is requested explicitly so decoding does not depend on the platform default.
with open("data/gutenold/toy_sentences.txt", 'r', encoding='utf-8') as f:
    docList = [line.split('\t')[1].strip() for line in f]

# create labels
# Same layout as the sentences file: the label is the second tab-separated field.
with open("data/gutenold/toy_labels.txt", 'r', encoding='utf-8') as f:
    labels = [line.split('\t')[1].strip() for line in f]

# create vocab
def get_vocab(docList, min_df=3):
    """
    Fit a TF-IDF model on the corpus and return its vocabulary and weights.

    Args:
        docList: iterable of raw document strings.
        min_df: minimum document frequency passed to TfidfVectorizer; terms
            below this threshold are dropped from the vocabulary. Defaults to 3.

    Returns:
        (vocab, tfidfvect) where `vocab` is the result of
        `get_feature_names_out()` and `tfidfvect` is the matrix returned by
        `fit_transform` (rows are documents, columns follow `vocab` order).
    """
    tfidf = TfidfVectorizer(min_df=min_df)
    tfidfvect = tfidf.fit_transform(docList)
    vocab = tfidf.get_feature_names_out()
    print("Vocab is complete")

    return vocab, tfidfvect


# Fit the TF-IDF vocabulary once at module level; `vocab` is reused further down
# to size the node set (len(docList) + len(vocab)). build_graph() calls
# get_vocab() again on its own rather than reusing these values.
vocab, tfidfvect = get_vocab(docList)
def PMIEdges(docList, wordList, window, adj):
    """
    Add positive-PMI word-to-word edges to the adjacency matrix.

    A window of `window` tokens is slid over every whitespace-tokenised
    document. With #W the total number of windows, #W(i) the number of windows
    containing word i and #W(i, j) the number containing both i and j:

        PMI(i, j) = log( (#W(i, j) * #W) / (#W(i) * #W(j)) )

    Only pairs with PMI > 0 receive an edge (weight = PMI). The diagonal of the
    word block is set to 1.

    Args:
        docList: iterable of document strings; tokens come from `str.split()`.
        wordList: sequence of vocabulary words; a word's position is its node
            index in `adj`.
        window: sliding-window length in tokens. A non-empty document shorter
            than the window is treated as a single window.
        adj: square matrix supporting `adj[rows, cols] = values`, at least
            len(wordList) wide. It is modified in place.

    Returns:
        The same `adj` object, with the word-word block filled in.

    NOTE(review): tokens are matched case-sensitively against `wordList`. If
    the vocabulary was produced by a tokenizer that lowercases or strips
    punctuation, some words will never match; those are printed below.
    """
    wordList = tuple(wordList)
    numWords = len(wordList)
    word2index = {name: index for index, name in enumerate(wordList)}

    # #W(i) and #W(i, j); int64 so the products below cannot overflow in practice.
    windowCounts = np.zeros(numWords, dtype=np.int64)
    occurrences = np.zeros((numWords, numWords), dtype=np.int64)

    # total windows #W
    counter = 0

    for doc in docList:
        docSplit = doc.split()
        if not docSplit:
            continue
        # Short documents still contribute exactly one (truncated) window.
        numWindows = max(len(docSplit) - window + 1, 1)
        for start in range(numWindows):
            counter += 1
            # Distinct vocabulary words in this window: each counts once per
            # window, no matter how often it repeats inside it.
            present = {
                word2index[token]
                for token in docSplit[start:start + window]
                if token in word2index
            }
            for index in present:
                windowCounts[index] += 1
            for i1, i2 in combinations(present, 2):
                occurrences[i1, i2] += 1
                occurrences[i2, i1] += 1

    # Report vocabulary words that never appeared in any window.
    for index in np.flatnonzero(windowCounts == 0):
        print(wordList[index])

    # PMI > 0  <=>  #W(i,j) * #W > #W(i) * #W(j). Testing this in integers is
    # exact and never divides by zero (a selected pair has both counts > 0).
    pairProduct = np.outer(windowCounts, windowCounts)
    rows, cols = np.nonzero(occurrences * counter > pairProduct)
    weights = np.log(occurrences[rows, cols] * counter / pairProduct[rows, cols])

    print("computations complete")

    # add into adjacency matrix; occurrences has a zero diagonal, so (rows, cols)
    # never touches the self-loops set afterwards.
    adj[rows, cols] = weights
    diagonal = np.arange(numWords)
    adj[diagonal, diagonal] = 1

    print("PMI is complete")
    return adj


def build_graph(docList, wordEdges = "PMI", window = 10):
    """
    Build the normalised word/document adjacency matrix following Yao et al. (2019).

    Node layout: the first numWords indices are vocabulary words and the last
    numDocs indices are documents.

    Args:
        docList: list of document strings (the corpus itself, not a path).
        wordEdges: "PMI" adds word-to-word edges via PMIEdges; any other value
            leaves the word-word block as the identity.
        window: sliding-window size forwarded to PMIEdges.

    Returns:
        Dense numpy array of shape (numNodes, numNodes) holding
        D^-1/2 * A * D^-1/2, where D is the diagonal matrix of row sums of A.
    """

    # initialize variables
    numDocs = len(docList)
    wordList, tfvect = get_vocab(docList)
    numWords = len(wordList)
    numNodes = numWords + numDocs

    # start from self-loops only (dense identity)
    adj = np.identity(numNodes)

    # build word-to-word edges
    if wordEdges == "PMI":
        adj = PMIEdges(docList, wordList, window, adj)

    # build word-to-doc edges using TF-IDF. The two off-diagonal blocks are
    # still all zero at this point, so writing the whole (docs x words) matrix
    # is equivalent to copying only its positive entries.
    tfiter = tfvect.toarray()
    adj[:numWords, numWords:] = tfiter.T
    adj[numWords:, :numWords] = tfiter
    print("TFIDF is complete")

    # return adjacency matrix using A^ = D^-1/2 * A * D^-1/2.
    # Row/column scaling by broadcasting gives the same entries as multiplying
    # by the dense diagonal matrix on both sides, without the O(n^3) matmuls.
    # Every row sum is >= 1 because of the self-loop, so the power is finite.
    scale = np.power(np.sum(adj, axis = 1), -0.5)
    adj = adj * scale[:, np.newaxis] * scale[np.newaxis, :]

    return adj



# build adj
# Word-word edges use PMI with a 5-token sliding window.
adj = build_graph(docList, wordEdges="PMI", window=5)

# create dataset
# Third argument is the total node count (documents + vocabulary words).
# NOTE(review): the recorded output of this cell ends in a TypeError
# (ufunc 'bitwise_xor'); which call raised it is not visible here -- confirm
# before relying on `data` in later cells.
data = TextDataset(
    docList,
    labels,
    len(docList) + len(vocab),
    test_size=1/3,
)

        

Vocab is complete
Vocab is complete
computations complete
PMI is complete
TFIDF is complete
Vocab is complete
This is the: 29
computations complete
PMI is complete
TFIDF is complete


TypeError: ufunc 'bitwise_xor' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
# Identity feature matrix: one row per node (vocabulary words + documents).
X = torch.eye(len(vocab) + len(docList))

# NOTE(review): 16 is presumably the hidden-layer width -- confirm against gnn_model.gcn.
model = gcn(X, 16, adj)

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train_model(epochs = 100):
    """
    Run full-batch training on the module-level `model` with `optimizer`.

    Each epoch does one forward pass, computes the MSE loss between the model
    output and `data.y_train` on the rows selected by `data.train_mask`,
    backpropagates and takes one optimiser step.

    Args:
        epochs: number of optimisation steps to run (exactly `epochs`; the
            previous `range(1, epochs)` performed one fewer).

    NOTE(review): MSE is applied directly to the model output; this assumes
    `data.y_train` has the same shape as that output -- confirm against
    TextDataset.
    """
    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = F.mse_loss(model()[data.train_mask], data.y_train[data.train_mask])
        loss.backward()
        optimizer.step()

@torch.no_grad()
def test_model():
    model.eval()
    logits = model()
    mask1 = data.train_mask
    pred1 = logits[mask1].max(1)[1]
    return 

# Kick off training with the default `epochs` argument (epochs=100).
train_model()
