In [55]:
import csv
import copy
import collections
from operator import itemgetter

# Debug switch. NOTE(review): nothing in the cells visible here reads DEBUG —
# confirm whether a later cell uses it before removing it.
DEBUG = True

In [56]:
# How many of the most frequently occurring authors to keep
# (passed to find_most_frequent_authors).
authors_count = 100
# Minimum length of an article's 'quoted' list for the article to be kept
# (passed to remove_articles_with_few_edges).
min_edges_for_article = 10

In [57]:
def read_articles(path='DBLP-citation-Jan8.txt'):
    """Parse a DBLP citation dump into a list of article dicts.

    The dump is line oriented. A field line contains a ``#`` followed by a
    marker that identifies the field; the markers handled here are:

    * ``*``     -- title; starts a new record
    * ``@``     -- authors, stored as the raw string from the file
    * ``t``     -- date
    * ``index`` -- record index; the record is appended to the result when
      this line is seen
    * ``%``     -- index of one cited article, appended to ``'quoted'``

    Lines without ``#`` and lines with any other marker are ignored, as are
    non-title field lines whose content is shorter than two characters.

    Args:
        path: File to read. Defaults to the dump file name used by this
            notebook, so existing ``read_articles()`` calls are unaffected.

    Returns:
        A list of dicts. Every dict has ``'quoted'`` (list of str) and
        ``'title'``; ``'authors'``, ``'date'`` and ``'index'`` are present
        only if the corresponding line occurred in the record. Records that
        never reach an ``index`` line are not returned.
    """
    articles = []
    # Record currently being assembled; None until the first title line so
    # that stray field lines at the top of the file are skipped, not crashed on.
    article = None
    with open(path) as dump:
        for line in dump:
            # Split on the FIRST '#' only: a '#' inside the value itself
            # (e.g. a title ending in "C#") must stay part of the value.
            _, found, data = line.rstrip('\n').partition('#')
            if not found:
                continue
            if data[:1] == '*':
                # Always open a fresh record, even for an empty title.
                # Otherwise the following field lines would overwrite the
                # previous record and append that same dict a second time.
                article = {'quoted': [], 'title': data[1:]}
            elif article is None or len(data) < 2:
                continue
            elif data[:1] == '@':
                article['authors'] = data[1:]
            elif data[:1] == 't':
                article['date'] = data[1:]
            elif data[:5] == 'index':
                article['index'] = data[5:]
                # The record is published here; '%' lines seen afterwards
                # still land in it because the list holds the same dict.
                articles.append(article)
            elif data[:1] == '%':
                article['quoted'].append(data[1:])
    return articles

def filter_articles_without_authors(articles):
    """Return the articles that carry an 'authors' key, in their original order.

    The input list is not modified; the returned list holds the same dict
    objects as the input.
    """
    with_authors = []
    for record in articles:
        if 'authors' in record:
            with_authors.append(record)
    return with_authors

def find_most_frequent_authors(articles, authors_count):
    """Return the names of the ``authors_count`` most frequent authors.

    Each article's 'authors' string is split on "," (names are not stripped)
    and every resulting name is counted once per occurrence. Names are ranked
    by descending count; names with equal counts keep first-seen order. The
    number of distinct names is printed as a side effect.

    Args:
        articles: Iterable of dicts that each have an 'authors' string.
        authors_count: Maximum number of names to return.

    Returns:
        List of author names, most frequent first.
    """
    occurrences = collections.Counter(
        name
        for entry in articles
        for name in entry['authors'].split(",")
    )
    print('All authors ' + str(len(occurrences)))
    # sorted() is stable, also with reverse=True, so ties stay in first-seen order.
    ranked = sorted(occurrences.items(), key=itemgetter(1), reverse=True)
    return [name for name, _ in ranked[:authors_count]]

def filter_articles_not_containing_top_authors(articles, top_authors):
    """Keep only the articles whose every author is one of ``top_authors``.

    Note that this is stricter than the name suggests: an article with even
    one co-author outside ``top_authors`` is dropped. Author names are taken
    from the article's 'authors' string split on "," and compared verbatim.

    Args:
        articles: Iterable of dicts that each have an 'authors' string.
        top_authors: Iterable of author names to accept.

    Returns:
        List of the accepted article dicts, in their original order.
    """
    # Materialize once: constant-time lookups, and a one-shot iterable is
    # not consumed by the first membership test.
    top_author_set = set(top_authors)
    return [
        article
        for article in articles
        if all(author in top_author_set for author in article['authors'].split(","))
    ]

def remove_articles_with_few_edges(articles, min_edges_for_article):
    """Return the articles whose 'quoted' list has at least the given length.

    Prints the number of articles before and after filtering. The input list
    is not modified.

    Args:
        articles: Sequence of dicts that each have a 'quoted' list.
        min_edges_for_article: Smallest 'quoted' length that is still kept.

    Returns:
        List of the retained article dicts, in their original order.
    """
    print('All articles ' + str(len(articles)))
    kept = []
    for candidate in articles:
        if len(candidate['quoted']) < min_edges_for_article:
            continue
        kept.append(candidate)
    print('Articles with sufficient edges ' + str(len(kept)))
    return kept

def update_quoted_by(articles):
    """Return a deep copy of ``articles`` with dangling citations removed.

    In the copy, each article's 'quoted' list keeps only the entries that are
    the 'index' of some article in ``articles``; order and duplicates among
    the kept entries are preserved. The input list and its dicts are left
    untouched.

    Args:
        articles: Iterable of dicts that each have 'index' and 'quoted'.

    Returns:
        A new list of new article dicts with filtered 'quoted' lists.
    """
    all_indexes = {article['index'] for article in articles}
    articles_copy = copy.deepcopy(articles)
    for article in articles_copy:
        article['quoted'] = [quoted for quoted in article['quoted'] if quoted in all_indexes]
    return articles_copy

In [58]:
# Parse the full DBLP dump into a list of article dicts.
articles = read_articles()

In [59]:
# Keep only articles with at least min_edges_for_article entries in 'quoted'.
# This runs before the other filters, so the threshold applies to the
# citation lists as read from the dump.
articles = remove_articles_with_few_edges(articles, min_edges_for_article)

All articles 1511035
Articles with sufficient edges 71585


In [60]:
# Drop records without an 'authors' key; the author-based steps below index
# article['authors'] directly.
articles = filter_articles_without_authors(articles)

In [61]:
# Names of the authors_count authors that occur most often in the remaining articles.
most_frequent_authors = find_most_frequent_authors(articles, authors_count)

All authors 80056


In [62]:
# Keep only articles whose authors are ALL in most_frequent_authors; an
# article with any other co-author is dropped.
articles = filter_articles_not_containing_top_authors(articles, most_frequent_authors)

In [63]:
# Restrict every 'quoted' list to indexes of articles that are still in the
# set (the result is a deep copy of the list).
# NOTE(review): the saved output of this cell shows three "Quotations ..."
# lines, but update_quoted_by as defined above prints nothing — the stored
# output looks stale; re-run the notebook to refresh it.
articles = update_quoted_by(articles)

Quotations of processed articles 4150
Quotations of remaining articles 110
Qotations of removed articles 4040


In [64]:
# Number of articles left after all filters.
len(articles)

220

In [66]:
# Output paths (relative to the working directory) for the exported graph.
NODES_FILE = 'dblp_nodes.csv'
EDGES_FILE = 'dblp_edges.csv'

In [67]:
# Maps an article's 'index' string to the dense integer id assigned when the
# nodes file is written; the edges cell reads it back.
idx_dict = {}

In [68]:
# Write one node per line as "<article index>|<dense id>|<title>" and record
# the article index -> dense id mapping in idx_dict.
# NOTE(review): titles are written verbatim, so a title containing '|' adds
# an extra field to its line — confirm the consumer of this file tolerates that.
with open(NODES_FILE, 'w') as f:
    idx = 0  # dense ids follow list order, starting at 0
    for a in articles:
        f.write(a['index'] + '|' + str(idx) + '|' + a['title'] + '\n')
        idx_dict[a['index']] = idx
        idx += 1

In [71]:
# Write one citation per line as
# "<dense id of cited article> <dense id of citing article> <date of citing article>".
# idx_dict[q] raises KeyError unless q is the index of an article written to
# the nodes file, so this relies on 'quoted' having been restricted to the
# retained articles beforehand.
# NOTE(review): a['date'] raises KeyError for an article whose record set no
# 'date' — confirm every retained article has one.
with open(EDGES_FILE, 'w') as f:
    for a in articles:
        for q in a['quoted']:
            f.write(str(idx_dict[q]) + ' ' + str(idx_dict[a['index']]) + ' ' + a['date'] + '\n')
