In [1]:
import nltk
import string
import re 
import itertools
import heapq
import operator
import networkx as nx
import copy
import networkx as nx
from nltk.corpus import stopwords
from nltk import pos_tag
import matplotlib.pyplot as plt
import os
from gensim.models.keyedvectors import KeyedVectors
from sklearn.metrics.pairwise import euclidean_distances as ed
import numpy as np
import os
import pickle
from operator import mul
from nltk.metrics import scores
# Load word2vec-format vectors from the gzipped binary file in the working directory.
# NOTE: `word_vectors` is a module-level global; terms_to_graph_word_attraction reads it
# directly instead of taking it as a parameter, so this cell must run before that function is called.
word_vectors = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)



## Text Cleaning and Tokenisation

In [2]:
def clean_text_simple(text, remove_stopwords=True, pos_filtering=True, stemming=True):
    """Clean and tokenise raw text into two parallel token lists.

    Punctuation is stripped (intra-word dashes are kept), the text is split on
    whitespace, and the optional filters are applied in this order:
    POS filtering, stop-word removal, stemming.

    Args:
        text: the raw text to process.
        remove_stopwords: drop tokens found in NLTK's English stop-word list
            (case-sensitive match, as the list is lower-case).
        pos_filtering: keep only tokens tagged as nouns or adjectives.
        stemming: apply Porter's stemmer (lower-cased) to build the first list.

    Returns:
        A tuple ``(tokens_stemmed, tokens_unstemmed)`` of equal-length lists;
        position ``i`` in both lists refers to the same surviving token.
        When ``stemming`` is False both lists hold the unstemmed tokens.
    """
    # remove punctuation (preserving intra-word dashes)
    punct = set(string.punctuation.replace('-', ''))
    text = ''.join(ch for ch in text if ch not in punct)

    # split() without an argument collapses any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens, so empty input gives []
    tokens = text.split()

    if pos_filtering and tokens:
        # retain only nouns and adjectives
        keep_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR'}
        tokens = [token for token, tag in pos_tag(tokens) if tag in keep_tags]

    if remove_stopwords:
        # a set makes each membership test O(1) instead of scanning the list
        stpwds = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stpwds]

    tokens_unstemmed = list(tokens)
    if stemming:
        # apply Porter's stemmer
        stemmer = nltk.stem.PorterStemmer()
        tokens_stemmed = [stemmer.stem(token).lower() for token in tokens]
    else:
        # previously these names were only bound inside the stemming branch,
        # so stemming=False raised UnboundLocalError at the return statement
        tokens_stemmed = list(tokens)

    return(tokens_stemmed,tokens_unstemmed)

## Build Simple Graph

In [3]:
def terms_to_graph(terms, w):
    """Build a directed, weighted co-occurrence graph from a list of terms.

    An edge ``(a, b)`` is created whenever ``a`` appears before ``b`` inside a
    sliding window of ``w`` consecutive terms; its ``weight`` is the number of
    such co-occurrences. Self-edges are never created.

    Args:
        terms: the tokens from the pre-processed text, e.g. ['quick','brown','fox'].
        w: sliding-window size; clamped to ``len(terms)``.

    Returns:
        A ``networkx.DiGraph`` whose edges carry a ``weight`` attribute and
        whose nodes carry a ``weight`` attribute equal to their weighted degree.
        Terms that take part in no edge do not appear as nodes.
    """
    # the window can never be wider than the text itself
    w = min(w, len(terms))

    # (source, target) -> number of co-occurrences
    from_to = {}
    for i, considered_term in enumerate(terms):
        # the (at most) w-1 terms right before position i share a window with it;
        # this single loop covers both the initial window and the sliding part
        for previous_term in terms[max(0, i - w + 1):i]:
            # skip self-edges everywhere (the initial window used to let them through)
            if previous_term != considered_term:
                edge = (previous_term, considered_term)
                from_to[edge] = from_to.get(edge, 0) + 1

    g = nx.DiGraph()
    for (source, target), count in from_to.items():
        g.add_edge(source, target, weight=count)

    # dict() accepts both the plain dict (networkx 1.x) and the DegreeView (2.x+);
    # keyword arguments are used because the positional order of `name` and
    # `values` is swapped between networkx 1.x and 2.x
    degree_dict = dict(g.degree(weight='weight'))
    nx.set_node_attributes(g, name='weight', values=degree_dict)
    return(g)

## Word Attraction Force Helper Functions

In [4]:
def my_vector_getter(word, word_vectors):
    """Return the embedding of ``word`` as a ``(1, dim)`` array.

    Args:
        word: the token to look up.
        word_vectors: a mapping-like object indexed by word (it must raise
            ``KeyError`` for unknown words). If it exposes ``vector_size`` that
            is used as the fallback dimension, otherwise 300.

    Returns:
        The stored vector reshaped to one row. For an unknown word, a vector
        drawn uniformly from [-0.25, 0.25) that is seeded from the word itself,
        so the same unknown word always maps to the same vector (previously a
        fresh random vector was drawn on every call, which made distances
        involving unknown words differ between calls and between runs).
    """
    # local import: only the fallback branch needs it
    import zlib

    try:
        return (word_vectors[word].reshape(1,-1))
    except KeyError:
        dim = getattr(word_vectors, 'vector_size', 300)
        # crc32 is stable across processes (unlike hash()); the mask keeps the
        # seed in the unsigned 32-bit range RandomState requires
        raw = word if isinstance(word, bytes) else word.encode('utf-8')
        rng = np.random.RandomState(zlib.crc32(raw) & 0xffffffff)
        return (rng.uniform(-0.25,0.25,dim).reshape(1,-1))
        
def my_euclidean_distance(word1, word2, word_vectors):
    """Return the Euclidean distance between two word vectors, rounded to 4 decimals.

    Args:
        word1, word2: the words to compare.
        word_vectors: passed through to ``my_vector_getter`` for the lookup.

    Returns:
        A plain Python float.
    """
    distance = ed(my_vector_getter(word1, word_vectors),my_vector_getter(word2, word_vectors))
    # ed() returns a (1, 1) array for two single-row inputs; take the scalar out
    # explicitly so round() gives a float on Python 3 as well (there, rounding
    # the array itself returns an array)
    return (round(float(distance[0, 0]), 4))

## WAF Graph Builder

In [5]:
def terms_to_graph_word_attraction(terms_stemmed,terms_unstemmed,w):
    """Build a directed graph of words weighted by word attraction.

    Edges are found exactly like in a plain co-occurrence graph (ordered pairs
    of distinct stems inside a sliding window of size ``w``). Each edge is then
    weighted by ``attr = dice * force`` where

        force = freq(w1) * freq(w2) / distance(w1, w2) ** 2
        dice  = 2 * cooccurrences(w1, w2) / (freq(w1) + freq(w2))

    and finally divided by the smallest non-zero ``attr`` and rounded, so the
    weakest edge gets weight 1.

    Args:
        terms_stemmed: stemmed tokens; they become the graph nodes.
        terms_unstemmed: the unstemmed token for each position of
            ``terms_stemmed``; the first unstemmed form seen for a stem is the
            one looked up in the embedding.
        w: sliding-window size; clamped to ``len(terms_stemmed)``.

    Returns:
        A ``networkx.DiGraph`` with a ``weight`` attribute on every edge and a
        ``weight`` node attribute equal to the weighted degree.

    Note:
        Reads the module-level ``word_vectors`` global for the embeddings.
    """
    # the window can never be wider than the text itself
    w = min(w,len(terms_stemmed))

    # (source, target) -> number of co-occurrences, self-edges excluded
    from_to = {}
    for i, considered_term in enumerate(terms_stemmed):
        # the (at most) w-1 terms right before position i share a window with it
        for previous_term in terms_stemmed[max(0, i - w + 1):i]:
            if previous_term != considered_term:
                edge = (previous_term, considered_term)
                from_to[edge] = from_to.get(edge, 0) + 1

    # frequency and first surface form of every stem, gathered in one pass
    # instead of calling .count()/.index() on the full list for every edge
    term_freq = {}
    surface_form = {}
    for stem, token in zip(terms_stemmed, terms_unstemmed):
        term_freq[stem] = term_freq.get(stem, 0) + 1
        surface_form.setdefault(stem, token)

    waf = {}
    for edge, cooccurrences in from_to.items():
        word1, word2 = edge
        word1_freq = term_freq[word1]
        word2_freq = term_freq[word2]
        distance = my_euclidean_distance(surface_form[word1],surface_form[word2],word_vectors)
        if distance == 0:
            # identical vectors: the attraction is undefined (division by zero)
            continue
        force = round(word1_freq * word2_freq / float(distance * distance), 5)
        # Dice coefficient: float division (integer division floored it to 0 on
        # Python 2 and silently dropped edges) over the SUM of the frequencies
        # (with the former product the frequencies cancelled out of dice*force)
        dice = 2.0 * cooccurrences / (word1_freq + word2_freq)
        attr = dice*force
        if attr != 0:
            waf[edge] = attr

    g = nx.DiGraph()
    if waf:
        min_attr = min(waf.values())
        for (source, target), attr in waf.items():
            # rescale so the weakest edge has weight 1
            g.add_edge(source, target, weight=round(attr / min_attr))

    # dict() accepts both the plain dict (networkx 1.x) and the DegreeView (2.x+);
    # keyword arguments are used because the positional order of `name` and
    # `values` is swapped between networkx 1.x and 2.x
    degree_dict = dict(g.degree(weight='weight'))
    nx.set_node_attributes(g, name='weight', values=degree_dict)
    return(g)

## Hulth Dataset Graph Constructors

In [None]:
# Directory holding the Hulth2003 validation abstracts (*.abstr files).
path = "EMNLP_2016-master/data/Hulth2003/validation_training/validation/"
total = os.listdir(path)
# sorted so the files are processed in the same order on every run
abstract_files = sorted(filename for filename in total if '.abstr' in filename)
stemmer = nltk.stem.PorterStemmer()

# All edge lists and node pickles are written here; create it on a fresh checkout.
output_dir = "Hulth-edgelists/"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# Cleaning does not depend on the window size, so do it once per abstract
# instead of once per (abstract, window size) pair.
tokens_by_file = {}
for filename in abstract_files :
    filepath =  path+filename
    with open(filepath) as abstract_file :
        text = abstract_file.read()
    text = " ".join(text.strip().split())
    tokens_by_file[filename] = clean_text_simple(text)

for window_size in range(3,15) :
    # %-formatting prints the same line on Python 2 and Python 3
    print("Window Size =  %s" % window_size)
    for filename in abstract_files :
        tokens_stemmed,tokens_unstemmed = tokens_by_file[filename]
        prefix = output_dir+filename+"_"+str(window_size)
        edgelist_file = prefix+"_normal.edgelist"
        edgelist_weighted_file = prefix+"_weighted.edgelist"
        edgelist_waf_file = prefix+"_waf.edgelist"
        nodelist_file = prefix+"_normal_node.pickle"
        nodelist_waf_file = prefix+"_waf_node.pickle"

        #Traditional Graph of Words
        G = terms_to_graph(tokens_stemmed,window_size)
        # drop self-loops without relying on G.selfloop_edges(), which only
        # exists on older networkx; the list is built first so the graph is
        # not modified while its edges are being iterated
        G.remove_edges_from([(u, v) for u, v in G.edges() if u == v])
        # list(): pickle a plain list of labels, not a view object tied to the graph
        nodes = list(G.nodes())
        # integer labels follow the node order saved in the pickle
        H = nx.convert_node_labels_to_integers(G)
        with open (nodelist_file,'wb') as f :
            pickle.dump(nodes, f, pickle.HIGHEST_PROTOCOL)
        nx.write_edgelist(H,edgelist_file,comments='#', data=False, encoding='utf-8')
        nx.write_edgelist(H,edgelist_weighted_file,comments='#', data=True, encoding='utf-8')

        #Word Attraction Force Graph of Words
        G_waf = terms_to_graph_word_attraction(tokens_stemmed,tokens_unstemmed,window_size)
        G_waf.remove_edges_from([(u, v) for u, v in G_waf.edges() if u == v])
        nodes = list(G_waf.nodes())
        with open (nodelist_waf_file,'wb') as f :
            pickle.dump(nodes, f, pickle.HIGHEST_PROTOCOL)
        H = nx.convert_node_labels_to_integers(G_waf)
        nx.write_edgelist(H,edgelist_waf_file,comments='#', delimiter=' ', data=True, encoding='utf-8')