In [None]:
!pip install ujson igraph xnetwork infomap

In [None]:
from tqdm.auto import tqdm
import os
from os.path import join as PJ
# import bgzf
import struct
import numpy as np
import operator
import gensim
import ujson
import igraph as ig
import xnetwork as xn
import glob
import numpy as np
from tqdm.auto import tqdm
from os.path import join as PJ
import matplotlib.pyplot as plt

In [None]:
from infomap import Infomap
def infomapMembership(vertexCount,edges):
    """Run Infomap on an edge list and return one module id per vertex.

    Parameters
    ----------
    vertexCount : int
        Number of vertices; node ids ``0 .. vertexCount-1`` are registered.
    edges : iterable
        Pairs (or longer sequences) whose first two items are the endpoints.

    Returns
    -------
    list
        ``membership[i]`` is the ``module_id`` of the leaf tree node whose
        ``node_id`` is ``i``; entries with no leaf in the tree stay ``0``.
    """
    # A fresh random seed is drawn on every call, so results vary between runs.
    seed = np.random.randint(4294967296, dtype=np.int64)
    im = Infomap("-N 10 --ftree --silent --seed %d" % seed)
    im.setVerbosity(0)

    for vertex in range(vertexCount):
        im.add_node(vertex)
    for link in edges:
        im.add_link(link[0], link[1])
    im.run()

    membership = [0] * vertexCount
    for leaf in (treeNode for treeNode in im.tree if treeNode.is_leaf):
        membership[leaf.node_id] = leaf.module_id
    return membership

In [None]:
def infomapApply(g, weights=None):
    """Run hierarchical Infomap on a graph and return per-level memberships.

    Parameters
    ----------
    g : graph object
        Must provide ``vcount()``, ``get_edgelist()`` and, when ``weights`` is
        given, an ``es`` edge sequence whose items expose ``source``,
        ``target`` and ``edge[weights]``.
    weights : str, optional
        Name of the edge attribute holding link weights. When falsy the graph
        is treated as unweighted.

    Returns
    -------
    list of list of str
        ``result[level][i]`` is the module path of vertex ``i`` truncated to
        ``level + 1`` components and joined with ``":"`` (e.g. ``"1:3"``).
        Vertices whose path is shorter than the requested level keep their
        full path. An empty graph yields ``[]``.
    """
    vertexCount = g.vcount()
    if vertexCount == 0:
        # Nothing to cluster; avoids max() on an empty sequence below.
        return []

    if weights:
        edges = [(e.source, e.target, e[weights]) for e in g.es]
    else:
        edges = g.get_edgelist()

    # Directed mode ("-d") was disabled in the original notebook and stays off.
    extraOptions = ""
    im = Infomap("%s -N 100 --silent --seed %d" %
                 (extraOptions, np.random.randint(4294967296, dtype=np.int64)), markov_time=2.0) # markov_time = Default 1

    im.setVerbosity(0)
    for nodeIndex in range(vertexCount):
        im.add_node(nodeIndex)
    for edge in edges:
        if len(edge) > 2:
            # Each weighted link is added exactly once, and only when its
            # weight is positive. The previous code added positive links twice
            # and still inserted non-positive ones.
            if edge[2] > 0:
                im.add_link(edge[0], edge[1], weight=edge[2])
        else:
            im.add_link(edge[0], edge[1])

    im.run()

    # Look paths up by node id so the result is aligned with vertex indices
    # regardless of the iteration order of the returned mapping.
    modules = im.get_multilevel_modules()
    paths = [tuple(modules[nodeIndex]) for nodeIndex in range(vertexCount)]

    levelCount = max(len(path) for path in paths)
    levelMembership = [
        [":".join(str(moduleId) for moduleId in path[:level + 1]) for path in paths]
        for level in range(levelCount)
    ]
    return levelMembership

In [None]:
from nltk.corpus import stopwords;
from nltk.stem.wordnet import WordNetLemmatizer;
import nltk.data;
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize;
from nltk.corpus import wordnet;
import re
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Notebook-wide switch for the progress messages printed by the setup cells.
verboseMode = True;

# Loading manual dictionary and ignore list.
# NOTE: the file-loading code below is commented out, so in this cell
# replaceDictionary stays empty and ignoreSet is not defined at all. A later
# %%cython cell builds its own replaceDictionary/ignoreSet from
# "replaceDictionary.txt" and "ignoreSet.txt".
if(verboseMode): print("Loading manual dictionary an ignore list.");
replaceDictionary = {};
# with open("replaceDictionary.dat","r") as fp:
# 	for line in fp:
# 		entry = line.strip().split("\t");
# 		if(len(entry)>1):
# 			replaceDictionary[entry[0]] = entry[1];

# ignoreSet = set();
# with open("ignoreSet.dat","r") as fp:
# 	for line in fp:
# 		ignoreSet.add(line.strip());


# Setting up nltk environment (only the message is printed here; nothing else
# is configured in this cell).
if(verboseMode): print("Setting up nltk environment.");

In [None]:
%load_ext Cython

In [None]:
nltk.download('stopwords')

In [None]:
nltk.download('punkt')

In [None]:
%%cython
from nltk.corpus import stopwords;
from nltk.stem.wordnet import WordNetLemmatizer;
import nltk.data;
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize;
from nltk.corpus import wordnet;
import re
import nltk

# Shared NLTK resources used by tokenizeString (passed around via tokenizerInput).
lmtzr = WordNetLemmatizer();
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Punctuation characters stripped from every token.
# The previous pattern had the literal text "/ < >" AFTER the closing bracket
# of the character class, so it only matched a punctuation character followed
# by "/ < >" and therefore never removed anything from a token. All intended
# characters (including "/") now live inside the class.
punctuation = re.compile(r'[()\[\]{}.?!,":;|<>/]');
stopSet = set(stopwords.words('english'));

print(stopSet)

verboseMode = True;

# Loading manual dictionary and ignore list.
# replaceDictionary.txt: one "source<TAB>replacement" pair per line; lines
# without a tab-separated second field are skipped.
if(verboseMode): print("Loading manual dictionary an ignore list.");
replaceDictionary = {};
with open("replaceDictionary.txt","r") as fp:
	for line in fp:
		entry = line.strip().split("\t");
		if(len(entry)>1):
			replaceDictionary[entry[0]] = entry[1];

# ignoreSet.txt: one word per line (surrounding whitespace stripped).
ignoreSet = set();
with open("ignoreSet.txt","r") as fp:
	for line in fp:
		ignoreSet.add(line.strip());


#Setting up nltk environment
if(verboseMode): print("Setting up nltk environment.");


def findWholeWord(w):
	"""Return a case-insensitive whole-word search function for ``w``.

	``w`` is inserted into the pattern as-is (it is not escaped), wrapped in a
	capturing group between two word boundaries. The returned callable is the
	compiled pattern's ``search`` method.
	"""
	wholeWordPattern = r'\b({0})\b'.format(w)
	compiled = re.compile(wholeWordPattern, flags=re.IGNORECASE)
	return compiled.search

def get_wordnet_pos(treebank_tag):
	"""Map a Treebank POS tag to the matching WordNet POS constant.

	Tags starting with J, V, N or R map to ``wordnet.ADJ``, ``wordnet.VERB``,
	``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any other tag yields
	the empty string.
	"""
	prefixToPos = (
		('J', wordnet.ADJ),
		('V', wordnet.VERB),
		('N', wordnet.NOUN),
		('R', wordnet.ADV),
	)
	for prefix, wordnetPos in prefixToPos:
		if treebank_tag.startswith(prefix):
			return wordnetPos
	return ''

# def get_wordnet_pos(treebank_tag):
# 	if treebank_tag.startswith('J'):
# 		return -1
# 	elif treebank_tag.startswith('V'):
# 		return -1
# 	elif treebank_tag.startswith('N'):
# 		return wordnet.NOUN
# 	elif treebank_tag.startswith('R'):
# 		return wordnet.ADV
# 	else:
# 		return ''

# Bundle every resource tokenizeString needs into one dictionary so it can be
# handed over as a single argument. The key spelling "lematizer" is what
# tokenizeString reads, so it is kept as-is.
if verboseMode:
	print("Setting up tokenizer.");

tokenizerInput = dict(
	stopSet=stopSet,
	punctuation=punctuation,
	tokenizer=sent_tokenizer,
	lematizer=lmtzr,
	sent_tokenize=sent_tokenize,
	replaceDictionary=replaceDictionary,
	ignoreSet=ignoreSet,
)

def tokenizeString(theString,maximumTokenSize,tokenizerInput,removeStopWords=True):
	"""Turn a text into sets of lemmatized n-grams.

	Parameters
	----------
	theString : str
		Input text; "::" separators are replaced by ". " before sentence
		splitting.
	maximumTokenSize : int
		Largest n-gram length to produce.
	tokenizerInput : dict
		Must contain the keys "stopSet", "lematizer", "tokenizer",
		"punctuation", "sent_tokenize", "replaceDictionary" and "ignoreSet".
	removeStopWords : bool
		When true, words found in the stop set are dropped.

	Returns
	-------
	list of set
		``result[k]`` holds the distinct (k+1)-grams, words joined by a single
		space. N-grams are built over the flat list of kept words, so they may
		span sentence boundaries.
	"""
	stopWords = tokenizerInput["stopSet"]
	lemmatizer = tokenizerInput["lematizer"]
	tokenizerInput["tokenizer"]  # looked up as before, but its value is not used
	punctuationPattern = tokenizerInput["punctuation"]
	splitSentences = tokenizerInput["sent_tokenize"]
	replacements = tokenizerInput["replaceDictionary"]
	ignoredWords = tokenizerInput["ignoreSet"]

	text = ". ".join(theString.split("::")).strip()
	sentences = [word_tokenize(sentence) for sentence in splitSentences(text)]

	keptWords = []
	for sentenceWords in sentences:
		for rawWord, tag in nltk.pos_tag(sentenceWords):
			# Skip plain numbers and numbers with a one-character prefix.
			if rawWord.isdigit() or rawWord[1:].isdigit():
				continue

			cleaned = punctuationPattern.sub("", rawWord)
			wordnetPos = get_wordnet_pos(tag)
			# -1 marks a tag to be dropped entirely; only the commented-out
			# variant of get_wordnet_pos in this cell returns it.
			if wordnetPos == -1:
				continue

			lowered = cleaned.lower()
			if wordnetPos != '':
				lemma = lemmatizer.lemmatize(lowered, wordnetPos)
			else:
				lemma = lemmatizer.lemmatize(lowered)

			if len(lemma) == 0:
				continue
			if (lemma in stopWords) and removeStopWords:
				continue
			if lemma in ignoredWords:
				continue
			keptWords.append(replacements[lemma] if lemma in replacements else lemma)

	# tokens[size-1] collects every run of `size` consecutive kept words.
	tokens = [set() for _ in range(maximumTokenSize)]
	for end in range(1, len(keptWords) + 1):
		for size in range(1, min(end, maximumTokenSize) + 1):
			tokens[size - 1].add(" ".join(keptWords[end - size:end]))
	return tokens

In [None]:
# Important for infomapApply: converts its level label strings into integer community indices.

def convert_membership_to_indices(membership_vector):
    """Relabel arbitrary group labels as consecutive integers starting at 0.

    Each entry is replaced by the position of its label in the sorted list of
    distinct labels (as computed by ``np.unique``), so equal labels receive
    equal indices. Returns a plain Python list.
    """
    labels = np.array(membership_vector)
    inverse = np.unique(labels, return_inverse=True)[1]
    return inverse.tolist()

In [None]:
def _clusterTokenImportance(clusters, verticesTokens, maximumTokenSize, verticesCount):
    """Score each cluster's tokens by in-cluster minus out-of-cluster frequency."""
    tokenFrequencyInCorpus = {}
    tokenFrequencyInClusters = []
    for cluster in clusters:
        tokenFrequencyInCluster = {}
        for vertexIndex in cluster:
            tokens = verticesTokens[vertexIndex]
            for tokenSize in range(maximumTokenSize):
                for token in tokens[tokenSize]:
                    tokenFrequencyInCorpus[token] = tokenFrequencyInCorpus.get(token, 0) + 1
                    tokenFrequencyInCluster[token] = tokenFrequencyInCluster.get(token, 0) + 1
        tokenFrequencyInClusters.append(tokenFrequencyInCluster)

    tokenImportanceIndexInClusters = []
    for cluster, tokenFrequencyInCluster in zip(clusters, tokenFrequencyInClusters):
        clusterSize = len(cluster)
        tokenImportanceIndexInCluster = {}
        for token, nInCluster in tokenFrequencyInCluster.items():
            nOutCluster = tokenFrequencyInCorpus[token] - nInCluster
            outClusterSize = verticesCount - clusterSize
            if nOutCluster == 0:
                # Avoids 0/0 when a single cluster covers the whole network.
                outClusterSize = 1
            FInCluster = float(nInCluster) / float(clusterSize)
            FOutCluster = float(nOutCluster) / float(outClusterSize)
            tokenImportanceIndexInCluster[token] = FInCluster - FOutCluster
        tokenImportanceIndexInClusters.append(tokenImportanceIndexInCluster)
    return tokenImportanceIndexInClusters


def _selectClusterKeywords(rankedKeywords, keywordsCount):
    """Take the top keywords, dropping those that start or end a longer selected n-gram."""
    keywords = list(rankedKeywords)
    currentKeywords = []
    while len(currentKeywords) < keywordsCount and len(keywords) > len(currentKeywords):
        currentKeywords = keywords[0:keywordsCount]
        # Keywords are joined with "." so that " kw." / ".kw " can only match a
        # keyword appearing as the last / first word of a longer keyword.
        jointKeywords = "." + ".".join(currentKeywords) + "."
        toRemoveKeywords = [
            keyword for keyword in currentKeywords
            if (" %s." % keyword) in jointKeywords or (".%s " % keyword) in jointKeywords
        ]
        for toRemoveKeyword in toRemoveKeywords:
            keywords.remove(toRemoveKeyword)
            currentKeywords.remove(toRemoveKeyword)
    return currentKeywords


def _buildClusterName(clusterIndex, keywords, maxClusterNameLength):
    """Build the "<letter> - kw1, kw2, ..." label, truncated to the maximum length."""
    defaultNames = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    if clusterIndex < len(defaultNames):
        clusterName = defaultNames[clusterIndex]
    else:
        clusterName = "{%d}" % (clusterIndex)
    clusterName += " - " + ", ".join(keywords)
    if len(clusterName) > maxClusterNameLength:
        # Reserve room for the ellipsis so the result never exceeds the limit.
        clusterName = clusterName[0:max(0, maxClusterNameLength - 3)] + "..."
    return clusterName


def apply_bardosova(network, jsonFileprefix, removeStopWords=True, maximumTokenSize=3,
                    minKeywordsPerCluster=10, maxKeywordsPerCluster=10,
                    maxClusterNameLength=150, useMajorComponent=True, verboseMode=True):
    """Detect communities, label them with characteristic keywords and save the network.

    Steps: optionally keep the giant weakly connected component, tokenize the
    "title" attribute of every vertex with ``tokenizeString``, take the first
    level returned by ``infomapApply`` as communities, score tokens by the
    difference between their in-cluster and out-of-cluster document frequency,
    and name each community after its top keywords. Vertices receive the
    attributes "ClusterName" and "ClusterIndex" (clusters are indexed from the
    largest to the smallest). The result is written to
    ``"<jsonFileprefix>_infomap.xnet"``.

    Parameters
    ----------
    network : igraph-like graph
        Vertices must carry a "title" attribute.
    jsonFileprefix : str
        Prefix of the output file name.
    removeStopWords : bool
        Forwarded to ``tokenizeString``.
    maximumTokenSize : int
        Largest n-gram length used as a keyword.
    minKeywordsPerCluster, maxKeywordsPerCluster : int
        Keyword count for the smallest / largest cluster; sizes in between are
        interpolated linearly.
    maxClusterNameLength : int
        Upper bound on the length of a cluster name, ellipsis included.
    useMajorComponent : bool
        When true, only the giant weakly connected component is processed;
        the graph passed in is then left without the new attributes.
    verboseMode : bool
        Print stage messages.

    Raises
    ------
    ValueError
        If the (possibly reduced) network has no vertices.
    """
    # Obtaining the major connected component (if needed)
    if useMajorComponent:
        if verboseMode: print("Obtaining the major connected component.")
        network = network.clusters("WEAK").giant()

    verticesCount = network.vcount()
    if verticesCount == 0:
        raise ValueError("apply_bardosova: the network has no vertices")

    # Tokenizing the vertex titles
    if verboseMode: print("Tokenizing the abstracts.")
    verticesTokens = []
    for vertexIndex in range(verticesCount):
        if vertexIndex % 100 == 0:
            print("Tokenizing: %d/%d             " % (vertexIndex, verticesCount), end="\r")
        verticesTokens.append(
            tokenizeString(network.vs[vertexIndex]["title"], maximumTokenSize,
                           tokenizerInput, removeStopWords)
        )
    print("Done                   ")

    # Obtaining the network community structure (top hierarchical level only)
    if verboseMode: print("Obtaining the network community structure.")
    communities = convert_membership_to_indices(infomapApply(network)[0])
    print()

    clusters = [[] for _ in range(max(communities) + 1)]
    for vertexIndex in range(verticesCount):
        clusters[communities[vertexIndex]].append(vertexIndex)

    # Largest clusters first; this order defines ClusterIndex and the letters.
    clusters = sorted(clusters, key=len, reverse=True)

    if verboseMode: print("Getting tokens frequency.")
    if verboseMode: print("Calculating the importance Index.")
    tokenImportanceIndexInClusters = _clusterTokenImportance(
        clusters, verticesTokens, maximumTokenSize, verticesCount)

    minClusterSize = min(len(cluster) for cluster in clusters)
    maxClusterSize = max(len(cluster) for cluster in clusters)
    # Slope of the linear interpolation between min and max keyword counts.
    if maxClusterSize > minClusterSize:
        m = (maxKeywordsPerCluster - minKeywordsPerCluster) / float(maxClusterSize - minClusterSize)
    else:
        m = 0

    for clusterIndex, cluster in enumerate(clusters):
        rankedItems = sorted(tokenImportanceIndexInClusters[clusterIndex].items(),
                             key=operator.itemgetter(1), reverse=True)
        keywords = [item[0] for item in rankedItems]
        keywordsCount = round(m * (len(cluster) - minClusterSize) + minKeywordsPerCluster)
        currentKeywords = _selectClusterKeywords(keywords, keywordsCount)
        clusterName = _buildClusterName(clusterIndex, currentKeywords, maxClusterNameLength)

        for vertexIndex in cluster:
            network.vs[vertexIndex]["ClusterName"] = clusterName
            network.vs[vertexIndex]["ClusterIndex"] = clusterIndex
        print(clusterName)
        print(clusterIndex)
        print(currentKeywords)

    # Saving the network
    if verboseMode: print("Saving the network.")
    xn.igraph2xnet(network, fileName=PJ('', "%s_infomap.xnet" % (jsonFileprefix)), ignoredNodeAtts=["Text"])

In [None]:
# Path of the input network in xnet format, relative to the working directory.
file = './Langmuir_Blodgett_films.xnet'
network = xn.xnet2igraph(file)
# Keep the original vertex names under 'wos_id' (presumably Web of Science
# ids - TODO confirm) before overwriting 'name' with the titles.
network.vs['wos_id'] = network.vs['name']
network.vs['name'] = network.vs['title']

# Preview the first ten vertex names to check the relabeling.
print(network.vs['name'][:10])


In [None]:
# Prefix of the output file; apply_bardosova writes "<prefix>_infomap.xnet".
output_header = 'Langmuir_Blodgett_films'
apply_bardosova(network, output_header)