In [1]:
import os
import json
import re
import numpy as np
from urllib import parse
from tqdm import tqdm_notebook

In [2]:
def parse_row(row):
    category = int(row.split(";")[0])
    row = ";".join(row.split(";")[1:])
    authors_indexes = row.split("http")[0][:-1]
    authors = authors_indexes.split('""')[0][1:].split(".,")
    authors = [(author+".").strip() for author in authors if len(author)>0]
    indexes = authors_indexes.split('""')[1]
    indexes = re.sub(r'[^\w\s]',' ',indexes).split()
    indexes = list(map(int, indexes))
    url = "http" + row.split("http")[1].split('""')[0] 
    keywords = '""'.join(row.split("http")[1].split('""')[1:])[1:].split(";")
    keywords = [re.sub(r'[^\w\s]',' ',keyword) for keyword in keywords]
    keywords = [" ".join(keyword.split()) for keyword in keywords]
    keywords = [keyword for keyword in keywords if len(keyword)>0]
    return [category, authors, indexes, url, keywords]

In [3]:
data = []
for year in ["2011","2012","2013","2014","2015","2016","2017"]:
    with open('data/{year}/authors_keywords_{year}.csv'.format(year = year), 'r') as file:
        for index, row in enumerate(file):
            if index ==0:
                continue
            category, authors, indexes, url, keywords = parse_row(row)
            if len(authors) > 25:
                print(index, len(authors),len(indexes))
            #print(index, len(authors),len(indexes))
            assert(len(authors) == len(indexes))
            out = {"year":year,
                   "category": category,
                   "url": url,
                   "keywords": "; ".join(keywords),
                   "authors": [{"id": idd, "author": author} for idd, author in zip(indexes,authors)]}
            data.append(out)

219 28 28
2483 28 28


In [4]:
import json

## Write
with open('new_data.json', 'w') as outfile:
    json.dump(data, outfile)

In [5]:
## Read
with open('new_data.json') as f:
    new_data = json.load(f)

new_data[0]

{'authors': [{'author': 'Gromov E.', 'id': 56802827200},
  {'author': 'Logvinova K.', 'id': 8284266700},
  {'author': 'Morozov V.', 'id': 37079439900},
  {'author': 'Tyutin V.', 'id': 6603597680}],
 'category': 0,
 'keywords': 'Anomalous diffusion; Fractal operator; Random',
 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-79961238208&partnerID=40&md5=f70f5d965f80944d48a76f2d5327ab3d',
 'year': '2011'}

In [6]:
# TEST
for text_row, answer in [('0;"Gromov E., Logvinova K., Morozov V., Tyutin V.,""56802827200";8284266700;37079439900;6603597680;",https://www.scopus.com/inward/record.uri?eid=2-s2.0-79961238208&partnerID=40&md5=f70f5d965f80944d48a76f2d5327ab3d"",""Anomalous diffusion"; Fractal operator;" Random"",";;;;;;;;;;;;;;;;;;;;;;;;;;;;;',
                        [0,
                         ["Gromov E.", "Logvinova K.", "Morozov V.", "Tyutin V."],
                         [56802827200, 8284266700, 37079439900, 6603597680],
                         "https://www.scopus.com/inward/record.uri?eid=2-s2.0-79961238208&partnerID=40&md5=f70f5d965f80944d48a76f2d5327ab3d",
                         ["Anomalous diffusion", "Fractal operator", "Random"]])]:
    assert(parse_row(text_row)==answer), text_row

In [7]:
len(new_data)

9831

In [8]:
new_data[0]['keywords']

'Anomalous diffusion; Fractal operator; Random'

In [9]:
# Assign to each keyword the category by article's label class (from 0 to 24)

all_keywords_cat = []
for line in range(len(new_data)):
    splitted_line = new_data[line]['keywords'].lower()
    splitted_line = splitted_line.split("; ")
    for word in splitted_line:
            word_cat = {word : new_data[line]['category']}
            all_keywords_cat.append(word_cat)

In [10]:
all_keywords_cat[:9]

[{'anomalous diffusion': 0},
 {'fractal operator': 0},
 {'random': 0},
 {'clusters': 1},
 {'input output model': 1},
 {'networks': 1},
 {'absorptive capacity': 1},
 {'companies': 1},
 {'knowledge management': 1}]

In [11]:
# List of all keywords in all articles
all_keywords = []
for line in range(len(new_data)):
    splitted_line = new_data[line]['keywords'].lower()
    splitted_line = splitted_line.split("; ")
    for word in splitted_line:
            all_keywords.append(word)

In [17]:
print('Size: ', len(all_keywords))
all_keywords[:9]

Size:  95849


['anomalous diffusion',
 'fractal operator',
 'random',
 'clusters',
 'input output model',
 'networks',
 'absorptive capacity',
 'companies',
 'knowledge management']

In [21]:
# Set of all unique keywords in all articles

keywords_dict = set(all_keywords)

print("Всего ключевых слов по всем статьям: ", len(all_keywords))
print("Всего уникальных ключевых слов: ", len(keywords_dict))

Всего ключевых слов по всем статьям:  95849
Всего уникальных ключевых слов:  27836


In [22]:
# For each word assign its unique id
index = 0
list_keywords = list(keywords_dict)
keyword_id = {}
for word in list_keywords:
    keyword_id[word] = int(index)
    index +=1 

keyword_id #dict type

{'': 0,
 'input parameter': 1,
 'influential historians of science of the 20th century': 2,
 'economic news': 3,
 'sense': 4,
 'historical alternations': 5,
 'bootstrap method': 6,
 'russian funeral market': 7,
 'south east asia': 8,
 'optimization procedures': 9,
 'numerical algorithm': 10,
 'electricity demands': 11,
 'maturity levels': 12,
 'bigartm': 13,
 'share price': 14,
 'lms project': 15,
 'pixels': 16,
 'sequential pattern structures artificial intelligence': 17,
 'random condition': 18,
 'human reaction': 19,
 'long terminal repeat': 20,
 'design innovation': 21,
 'sensestructure': 22,
 'russia asean dialogue partnership': 23,
 'potential elites': 24,
 'global richardson extrapolation': 25,
 'medieval ethiopian kingdom': 26,
 'focal point': 27,
 'spectral sensitivity': 28,
 'operation mode': 29,
 'kohn luttinger mechanism': 30,
 'mismatch negativity mmn': 31,
 'non metric': 32,
 'risky behavior behavioral research': 33,
 'tumor necrosis factor': 34,
 'ceo s personal characte

In [23]:
# Create for each article a binarized vector of features

for line in range(len(new_data)):
    new_data[line]['keywords_vector'] = np.zeros((len(keywords_dict),), dtype=int) # zero array of len 27836
    splitted_line = new_data[line]['keywords'].lower()
    splitted_line = splitted_line.split("; ")
    for word in splitted_line:
        if word in keywords_dict:
            its_id = keyword_id[word]
            new_data[line]['keywords_vector'][its_id] = 1

In [26]:
#Example

print(new_data[2]['keywords_vector'], '- вектор длины ', len(new_data[2]['keywords_vector']))
a = new_data[2]['keywords_vector']
print("Количество ненулевых элементов в бинарном векторе строки: ", np.count_nonzero(a))

[0 0 0 ..., 0 0 0] - вектор длины  27836
Количество ненулевых элементов в бинарном векторе строки:  8


In [27]:
# If the list of article's keywords was empty -  [1, 0, ..., 0], where on the first position is 1, other digits - 0.
# Example with empty keywords
new_data[245]

{'authors': [{'author': 'Popov S.V.', 'id': 54416259000},
  {'author': 'Bernhardt D.', 'id': 7006377194}],
 'category': 7,
 'keywords': '',
 'keywords_vector': array([1, 0, 0, ..., 0, 0, 0]),
 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-84865114984&doi=10.1257%2fmic.4.1.116&partnerID=40&md5=4d88e1c6b3922286bf72203239c6f594',
 'year': '2012'}

**Версия 1**

Файл 1 (like cora.content). Строим id + keyword_vector + label_class по id статей.

Статьи не повторяются, поэтому id будут уникальными объектами. Ничего дополнительно с бинарным вектором и меткой класса делать не нужно.

In [29]:
new_lines = []
index = 0
used_ids = set()

for line in tqdm_notebook(range(len(new_data))): # 9831
    # get unique paper_id from url
    url = new_data[line]['url']
    parse.urlsplit(url)
    eid = dict(parse.parse_qsl(parse.urlsplit(url).query))
    eid['eid'] = eid['eid'][7:]
    new_data[line]['ids'] = eid['eid']
    
    # get binarized vector of keywords in a string dtype
    all_vector_values = list(new_data[line]['keywords_vector'])
    vector_string = ''
    for each in all_vector_values:
        vector_string += (str(each) + ' ')
    
    if new_data[line]['ids'] not in used_ids:
        new = str(new_data[line]['ids']) + ' ' + vector_string + ' ' + str(new_data[line]['category'])
        used_ids.add(new_data[line]['ids'])
        new_lines.append(new)





In [30]:
# Example 
print(len(new_lines[0]))
new_lines[0]

55686


'79961238208 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [32]:
# Write file

with open('HSEcite_content.csv', 'w') as file:
    for each in tqdm_notebook(range(len(new_lines))):
        file.write(new_lines[each])
        file.write('\n')




In [35]:
# Read file

#with open('HSEcite_content.csv') as file:

hse_content = new_lines

#print(hse_content[0])

Далее хотим получить второй файл:

id\_статьи +  id\_цитирующей\_статьи

In [36]:
def parse_row_file2(row):
    
    authors = row.split(',""')[0]
    authors_id = row.split('""')[1]
    
    # There are papers without list of reference
    # We simply make empty lists for new variables
    if len(row.split('",')) < 3:
        url = ' '
        paper_id = ' '
        cited_authors = []
    else:
        url = row.split('",')[1][:-2]

        # Get unique paper_id from its url
        parse.urlsplit(url)
        eid = dict(parse.parse_qsl(parse.urlsplit(url).query))
        eid['eid'] = eid['eid'][7:]
        paper_id = eid['eid']

        #Get list of authors' names who were in reference list 
        cited_authors_line = row.split('","')[1]
        regex = re.compile(r'\b([A-Z][a-z]+.(?:.[A-Z]\.)(?:[A-Z]\.)*)')
        cited_authors = regex.findall(cited_authors_line)

    return authors, authors_id, url, paper_id, cited_authors

In [377]:
#Example

('Kotliarov I.',
 ' 25626497500; ',
 ' https://www.scopus.com/inward/record.uri?eid=2-s2.0-84455192541&partnerID=40&md5=0cd17a4a48afa1d6ba88666f57a7c42f',
 '84455192541',
 ['Agrawal, D.',
  'Lal, R.',
  'Azgaldov, G.G.',
  'Karpova, N.N.',
  'Blair, R.D.',
  'Francine, L.',
  'Chaudey, M.',
  'Fadairo, M.',
  'Dnes, A.W.',
  'Kabak, M.L.',
  'Kaufmann, P.J.',
  'Lafontaine, F.',
  'Kotliarov, I.',
  'Lafontaine, F.',
  'Lafontaine, F.',
  'Lafontaine, F.',
  'Bhattacharyya, S.',
  'Martin, R.E.',
  'Mathewson, F.',
  'Winter, R.',
  'Michael, S.M.',
  'Minkler, A.',
  'Norton, S.',
  'Raynaud, E.',
  'Saussier, S.',
  'Perrigot, R.',
  'Rao, R.C.',
  'Shubashri, S.',
  'Rubin, P.',
  'Sen, K.C.',
  'Stazhkova, M.M.'])

In [37]:
data2 = []
with open('data/2017/authors_cites_2011_2017.csv', 'r') as file:
    for index, row in enumerate(file):
        if index ==0:
            continue
        authors, authors_id, url,  paper_id, cited_authors = parse_row_file2(row)
        out = {"authors":authors,
               "authors_id": authors_id,
               "url": url,
               "paper_id": paper_id,
               "cited_authors": cited_authors}
        data2.append(out)

In [39]:
# Write     
with open('HSEcite_cites_prep.json', 'w') as outfile:
    json.dump(data2, outfile)

# Read json
with open('HSEcite_cites_prep.json', 'r') as f:
    hse_cites = json.load(f)
    
hse_cites[0]

{'authors': '"Kotliarov I.',
 'authors_id': '25626497500";",https://www.scopus.com/inward/record.uri?eid=2-s2.0-84455192541&partnerID=40&md5=0cd17a4a48afa1d6ba88666f57a7c42f',
 'cited_authors': ['Agrawal, D.',
  'Lal, R.',
  'Azgaldov, G.G.',
  'Karpova, N.N.',
  'Blair, R.D.',
  'Francine, L.',
  'Chaudey, M.',
  'Fadairo, M.',
  'Dnes, A.W.',
  'Kabak, M.L.',
  'Kaufmann, P.J.',
  'Lafontaine, F.',
  'Kotliarov, I.',
  'Lafontaine, F.',
  'Lafontaine, F.',
  'Lafontaine, F.',
  'Bhattacharyya, S.',
  'Martin, R.E.',
  'Mathewson, F.',
  'Winter, R.',
  'Michael, S.M.',
  'Minkler, A.',
  'Norton, S.',
  'Raynaud, E.',
  'Saussier, S.',
  'Perrigot, R.',
  'Rao, R.C.',
  'Shubashri, S.',
  'Rubin, P.',
  'Sen, K.C.',
  'Stazhkova, M.M.'],
 'paper_id': '84455192541',
 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-84455192541&partnerID=40&md5=0cd17a4a48afa1d6ba88666f57a7c42'}

In [40]:
# For better understanding of code below
new_data[0]

{'authors': [{'author': 'Gromov E.', 'id': 56802827200},
  {'author': 'Logvinova K.', 'id': 8284266700},
  {'author': 'Morozov V.', 'id': 37079439900},
  {'author': 'Tyutin V.', 'id': 6603597680}],
 'category': 0,
 'ids': '79961238208',
 'keywords': 'Anomalous diffusion; Fractal operator; Random',
 'keywords_vector': array([0, 0, 0, ..., 0, 0, 0]),
 'url': 'https://www.scopus.com/inward/record.uri?eid=2-s2.0-79961238208&partnerID=40&md5=f70f5d965f80944d48a76f2d5327ab3d',
 'year': '2011'}

In [41]:
# Create dictionary of <'author_name' : paper_id >
# (!) Author has the last paper's id if he wac a co-author in several of them

authors_paper_id = {} #len 8370

for line in range(len(new_data)):
    for each in new_data[line]['authors']:
        ids = new_data[line]['ids']
        authors_paper_id[each['author']] = ids

In [43]:
# Example 

authors_paper_id['Gromov E.'], authors_paper_id['Logvinova K.'], authors_paper_id['Morozov V.'], authors_paper_id['Tyutin V.'] 

('85034223654', '79961238208', '79961238208', '79961238208')

In [44]:
# Create a list with <id_paper_cited + id_paper_citing>

data_cites = []

for line in range(len(hse_cites)):
    if hse_cites[line]['cited_authors'] == []:
        cited_paper = 0
        # Do not record papers without references
        #out = str(cited_paper) + ' ' + str(hse_cites[line]['paper_id'])
        #data_cites.append(out)
    else:
        for each in hse_cites[line]['cited_authors']:
            # In other dataset names written w/o comma, delete it to be able to compare all names
            if ',' in each:
                each_author = each.replace(',', '')
            if each_author in authors_paper_id.keys(): #Check if author from references is in set of HSE 2011-2017
                each_author_paper_id = authors_paper_id[each_author] # Assign cited paper_id
                cited_paper = each_author_paper_id
                out = str(cited_paper) + ' ' + str(hse_cites[line]['paper_id'])
                if out not in data_cites and out.split(" ")[0] != '0':
                    data_cites.append(out)

In [46]:
# Example

print('[paper_id_cited + paper_id_citing]')
print('Number of edges: ', len(data_cites))
data_cites[:10]

[paper_id_cited + paper_id_citing]
Number of edges:  21601


['84874778575 84455192541',
 '85027296323 84859080419',
 '84914156243 84859080419',
 '84957402806 82455246366',
 '84948614488 82455246366',
 '85035143983 82455246366',
 '85035220247 82455246366',
 '84955295991 82455246366',
 '85034583726 82455253324',
 '82455253324 82455253324']

In [47]:
with open('HSEcites.json', 'w') as file:
    for each in tqdm_notebook(range(len(data_cites))):
        file.write(data_cites[each])
        file.write('\n')




In [48]:
with open('HSEcite_cites.csv', 'w') as file:
    for each in tqdm_notebook(range(len(data_cites))):
        file.write(data_cites[each])
        file.write('\n')




**Версия 2**

Строим id_автора + его keyword_vector + label_class.

У авторов будет по несколько векторов, тогда поэлементно складываем их.
Пример для какого-нибудь id:

[1, 0, 0, 0, 1, 0, 0, ..., 0]

+

[0, 1, 0, 1, 0, 0, 0, ..., 1]

=

[1, 1, 0, 1, 1, 0, 0, ..., 1] -- итоговый вектор.
