In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import xnet
from igraph import *
import visualization
import pickle
import threading
import numpy as np
import util
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.spatial.distance import cosine as cos_dist
from collections import defaultdict
import concurrent.futures
from scipy import sparse
import pickle
import itertools

In [4]:
from nltk.corpus import stopwords
nltk.download('stopwords')
my_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/carol_mb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
stemmer = nltk.stem.SnowballStemmer('english')

In [4]:
net = util.get_net()

['abstract', 'authors_idx', 'authors_name', 'hasAbstract', 'source', 'title', 'year']
596786 7074118
576619 6712527
513586 5489623
513585 5489608
511613 5484471
267597 2183498


In [7]:
indegrees = np.asarray(net.indegree())
invalid_vtxs = [i for i,d in enumerate(indegrees) if d <= 4]
net.delete_vertices(invalid_vtxs)
print(net.vcount(),net.ecount())

years = net.vs['year']
unique,count = np.unique(years,return_counts=True)
for u,c in zip(unique,count):
    print(u,c)

113730 995638
2001.0 7824
2002.0 9080
2003.0 8254
2004.0 8843
2005.0 9407
2006.0 8570
2007.0 8110
2008.0 8905
2009.0 8191
2010.0 7810
2011.0 7434
2012.0 7567
2013.0 6191
2014.0 4627
2015.0 2460
2016.0 457


In [8]:
def text_filter(text):
    '''all words in lower case'''
    text = text.lower()
        
    '''removing dot, commas, etc'''
    special_chars = ['.',',',':',';','!','/','?','^','\\','|','"','(',')','[',']','{','}','=','_','$','〈','〉','+']
    for char in special_chars:
        text = text.replace(char,'')
    text = text.replace('-',' ')
    
    '''removing stop words and stemming the rest'''
    text = text.split(' ')
    clear_text = ''
    for word in text:
        if word == '':
            continue
        if not word in my_stopwords:
            word = stemmer.stem(word)
            clear_text += word + ' '
    text = clear_text[:-1]
    
    return text

In [9]:
# text_filter('crystal statistics i a two-dimensional model with an order-disorder transition two-dimensional model an order-disorder transition:: the partition function a two-dimensional "ferromagnetic" model the case the eigenwert problem the corresponding computation a long strip crystal finite width $n$ a cylinder, direct product decomposition; the special case a sum. the choice different interaction }}$ the problem. the two-way infinite crystal an order-disorder transition a temperature $t={t}_{c}$ the condition the energy a continuous function $t$; the specific heat $\ensuremath{-}log |t\ensuremath{-}{t}_{c}|$. the maximum the specific heat the order-converting dual transformation a simple automorphism the basis the quaternion algebra the problem hand. addition the massive crystal, the free energy boundary opposite order this basis the mean length a strip crystal tanh }^{n}.$')
net.vs['title_abstract'] = [text_filter(t+' '+a) for t,a in zip(net.vs['title'],net.vs['abstract'])]

In [10]:
vectorizer = TfidfVectorizer(min_df=3)
X = vectorizer.fit_transform(net.vs['title_abstract'])

In [11]:
similarity_net = Graph()
similarity_net.add_vertices(X.shape[0])
cut_param = 0.1
edges = []
weights = []
delta = 10000

def bla(X,i):
    print(i,i+delta)
    sim_mtx = X.dot(X[i:i+delta].T)
    sim_mtx = sim_mtx.tocoo()
    for a,b,v in zip(sim_mtx.row,sim_mtx.col,sim_mtx.data):
        if a <= b:
            continue
        if v > cut_param:
            edges.append([a,i+b])
            weights.append(v)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    for i in range(0,X.shape[0],delta):
        future = executor.submit(bla,X,i)
        print(future.result())

0 10000
None
10000 20000


ValueError: row index exceeds matrix dimensions

In [None]:
similarity_net.add_edges(edges)
similarity_net.es['weights'] = weights
xnet.igraph2xnet(similarity_net,'data/sim_net.xnet')

In [None]:
def plot(begin,sim_delta,colab_delta,net=None):
    '''
    generates the citation (a1 cites a2) net considering an interval of time
    '''
    citation_net,valid_authors = visualization.author_cites_author(net,begin,sim_delta)

    citation_net = visualization.repre_attribute(citation_net,mode='out')
    with open('temp/citation_net_'+str(begin), 'wb') as output:
        pickle.dump(citation_net, output, pickle.HIGHEST_PROTOCOL)

    with open('temp/citation_net_'+str(begin),'rb') as input:
        citation_net = pickle.load(input)

    '''
    coauthorship pairs considering the original citation network, an interval of time and the valid authors
    '''
    # force = visualization.author_colabs_author(net,begin+sim_delta,colab_delta,valid_authors)
    # with open('temp/force_'+str(begin), 'wb') as output:
    #     pickle.dump(force, output, pickle.HIGHEST_PROTOCOL)

    with open('temp/force_'+str(begin),'rb') as input:
        force = pickle.load(input)

    coauthorship_pairs = list(force.keys())
    # print('coauthorship_pairs',coauthorship_pairs)
    sims = visualization.calculate_sim(citation_net,coauthorship_pairs)
    with open('temp/sims_'+str(begin), 'wb') as output:
        pickle.dump(sims, output, pickle.HIGHEST_PROTOCOL)

    visualization.plot_sim_vs_colab(sims,force,begin)

In [None]:
m1 = sparse.csr_matrix([[1,0],[1,0],[0,0],[1,2]])
m2 = m1.tocoo()
for a,b,c in zip(m2.row,m2.col,m2.data):
    print(a,b,c)

In [None]:
# net = Graph(directed=True)
# net.add_vertices(4)
# net.vs['name'] = ['p1','p2','p3','p4']
# net.vs['year'] = [2010,2009,2001,2000]
# net.vs['authors_idx'] = ['a1,a2','a1,a3,a4','a1,a2,a3','a4']
# net.add_edges([(0,2),(0,3),(1,0),(1,2),(1,3),(2,3)])

In [None]:
# begins = range(2000,2006)
# sim_delta = 10
# colab_delta = 10

# for begin in begins:
#     plot(begin,sim_delta,colab_delta,net)
#     break
# print('Finished')