In [1]:
import pandas as pd
import csv
import json

import string
import nltk
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from googletrans import Translator

In [2]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ananya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ananya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

#stemmer = SnowballStemmer("english") # "Having" -> have; based -> base; calculi -> calculi
lemmatizer = WordNetLemmatizer() # "Having" -> Having (n); based -> based (n); calculi -> calculus

In [4]:
#tokens_not_considered = {'a','an','the','on','in','of','for','by','using','based','and','or','but','so','because','as','at','to','from','with'}
tokens_not_considered = STOPWORDS
tokens_not_considered -= {'system', 'computer'}
tokens_not_considered |= {'based'}
tokens_not_considered |= {w.title() for w in tokens_not_considered}
tokens_not_considered |= {"``", "''", "-"}
#print(tokens_not_considered)
tokens_not_considered

frozenset({"''",
           '-',
           'A',
           'About',
           'Above',
           'Across',
           'After',
           'Afterwards',
           'Again',
           'Against',
           'All',
           'Almost',
           'Alone',
           'Along',
           'Already',
           'Also',
           'Although',
           'Always',
           'Am',
           'Among',
           'Amongst',
           'Amoungst',
           'Amount',
           'An',
           'And',
           'Another',
           'Any',
           'Anyhow',
           'Anyone',
           'Anything',
           'Anyway',
           'Anywhere',
           'Are',
           'Around',
           'As',
           'At',
           'Back',
           'Based',
           'Be',
           'Became',
           'Because',
           'Become',
           'Becomes',
           'Becoming',
           'Been',
           'Before',
           'Beforehand',
           'Behind',
           'Being',
        

In [5]:
def print_first_n_dict_items(d: dict, dict_name: str, n: int):
    print("\nPrinting first", n, "items of dict named", dict_name, "\n========================================================")
    i = 0
    for key, value in d.items():
        print(key, ":", value)
        i += 1
        if i == n:
            break

In [6]:
def preprocess_data(filename, display_count):
    line_count = 0
    title_lemma, keyword_years = dict(), dict()
    title_years_authors = []
    with open(filename, encoding="UTF", newline='') as csvfile:
        articles = csv.reader(csvfile, delimiter=',')
        
        for row in articles:
            if row:
                if line_count > 0:
                    article_id = int(row[0])
                    year = 0 if row[-1] == '' else int(row[-1])
                    title = row[-6].translate(str.maketrans('', '', ",.;:'?!/`()[]{}<>\\"))
                    
                    #translator = Translator()
                    #if translator.translate(title).src == 'en': # Considering only the English titles
                    title_years_authors.append([article_id, title, year, row[3]])
                    title_lemma[title] = []

                    words = nltk.word_tokenize(title)
                    for word in words:
                        word = lemmatizer.lemmatize(word.lower())
                        #word = stemmer.stem(word)
                        if word not in tokens_not_considered:
                            title_lemma[title].append(word)

                            if word not in keyword_years:
                                keyword_years[word] = [year]
                            else:
                                keyword_years[word].append(year)

                    if line_count < display_count:
                        print(row)
                        print(title_years_authors[article_id])
                        print(title_lemma[title])
                        print(keyword_years)
                else:
                    print(row, end='\n----------------------------------------------------------------------------------------\n')
                line_count += 1
    
    print("No. of Lines:", line_count)
    
    return title_years_authors, title_lemma, keyword_years

In [7]:
title_years_authors, title_lemma, keyword_years = preprocess_data("./data/dblp_article.csv", 2)

['id', 'author', 'author-aux', 'author-orcid', 'booktitle', 'cdate', 'cdrom', 'cite', 'cite-label', 'crossref', 'editor', 'editor-orcid', 'ee', 'ee-type', 'i', 'journal', 'key', 'mdate', 'month', 'note', 'note-type', 'number', 'pages', 'publisher', 'publtype', 'sub', 'sup', 'title', 'title-bibtex', 'tt', 'url', 'volume', 'year']
----------------------------------------------------------------------------------------
['0', 'Sanjeev Saxena', '', '', '', '', '', '', '', '', '', '', 'https://doi.org/10.1007/BF03036466', '', '', 'Acta Inf.', 'journals/acta/Saxena96', '2017-05-28', '', '', '', '7', '607-619', '', '', '', '', 'Parallel Integer Sorting and Simulation Amongst CRCW Models.', '', '', 'db/journals/acta/acta33.html#Saxena96', '33', '1996']
[0, 'Parallel Integer Sorting and Simulation Amongst CRCW Models', 1996, '']
['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
{'parallel': [1996], 'integer': [1996], 'sorting': [1996], 'simulation': [1996], 'crcw': [1996], 'model

In [8]:
keyword_counts = {kword:len(years) for kword, years in keyword_years.items()}
keyword_counts = sorted(keyword_counts.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
keyword_counts = {t[0]:t[1] for t in keyword_counts}
#keyword_counts

In [9]:
print("\nNo. of Titles", len(title_years_authors), "\nNo. of keywords", len(keyword_years))
print("Total words:", sum(keyword_counts.values()))


No. of Titles 2008467 
No. of keywords 448770
Total words: 14216934


## Checking results

In [10]:
try:
    print(keyword_years['de'])
except:
    print("Not found")

Not found


In [11]:
with open('./data/dblp_article.csv', encoding="UTF", newline='') as csvfile:
    articles = csv.reader(csvfile, delimiter=',')
    c = 0
    for line in articles:
        if len(line) > 0 and ('*201' in line[-6]):
            print(line[-6])
#print([title for title,words in title_lemma.items() if '**-algebras' in words])

The Independence of Quine's Axioms *200 and *201.


In [12]:
print_first_n_dict_items(keyword_counts, "keyword_counts", 10)


Printing first 10 items of dict named keyword_counts 
system : 191799
network : 166093
model : 111842
analysis : 101884
algorithm : 97362
data : 83604
method : 81866
approach : 73121
problem : 66562
application : 66391


In [13]:
title_years_authors[0:10]
#print(title_years_authors[0:10])
#print_first_n_dict_items(title_years_authors, "title_years_authors", 10)

[[0, 'Parallel Integer Sorting and Simulation Amongst CRCW Models', 1996, ''],
 [1, 'Pattern Matching in Trees and Nets', 1983, ''],
 [2, 'NP-complete Problems Simplified on Tree Schemas', 1983, ''],
 [3, 'On the Power of Chain Rules in Context Free Grammars', 1982, ''],
 [4,
  'Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2',
  1977,
  ''],
 [5, 'A characterization of rational D0L power series', 2011, ''],
 [6, 'The Derivation of Systolic Implementations of Programs', 1987, ''],
 [7, 'Fifo Nets Without Order Deadlock', 1988, ''],
 [8,
  'On the Complementation Rule for Multivalued Dependencies in Database Relations',
  1978,
  ''],
 [9, 'Equational weighted tree transformations', 2012, '']]

In [14]:
print_first_n_dict_items(title_lemma, "title_lemma", 10)


Printing first 10 items of dict named title_lemma 
Parallel Integer Sorting and Simulation Amongst CRCW Models : ['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
Pattern Matching in Trees and Nets : ['pattern', 'matching', 'tree', 'net']
NP-complete Problems Simplified on Tree Schemas : ['np-complete', 'problem', 'simplified', 'tree', 'schema']
On the Power of Chain Rules in Context Free Grammars : ['power', 'chain', 'rule', 'context', 'free', 'grammar']
Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2 : ['schnelle', 'multiplikation', 'von', 'polynomen', 'über', 'körpern', 'der', 'charakteristik', '2']
A characterization of rational D0L power series : ['characterization', 'rational', 'd0l', 'power', 'series']
The Derivation of Systolic Implementations of Programs : ['derivation', 'systolic', 'implementation', 'program']
Fifo Nets Without Order Deadlock : ['fifo', 'net', 'order', 'deadlock']
On the Complementation Rule for Multivalued Dependencie

In [15]:
print_first_n_dict_items(keyword_years, "keyword_years", 3)


Printing first 3 items of dict named keyword_years 
parallel : [1996, 2000, 1989, 1992, 1993, 1995, 1996, 2001, 1992, 1991, 2000, 1981, 2007, 1981, 1995, 2010, 1988, 1983, 2007, 1976, 1987, 1997, 1994, 1974, 1975, 1998, 1984, 1984, 1984, 1992, 1987, 1983, 1996, 1976, 1982, 1999, 2011, 1992, 2006, 1998, 2005, 2005, 1988, 2001, 1987, 1999, 2010, 1995, 1998, 2014, 2000, 2001, 1996, 2002, 2006, 2018, 2001, 2014, 1992, 2016, 1998, 2007, 1998, 1987, 2013, 1995, 1992, 2018, 1999, 2002, 1992, 1996, 2001, 2006, 1994, 1994, 2002, 1992, 2010, 2007, 1995, 1992, 2018, 2010, 2018, 1998, 1994, 2014, 2017, 2001, 1995, 2002, 1998, 2000, 1992, 2000, 2000, 1989, 1994, 1990, 1988, 2015, 1990, 2010, 2013, 2014, 1995, 1991, 1991, 2003, 1986, 2018, 2003, 2006, 2003, 1987, 2000, 2013, 2018, 2003, 2010, 1995, 2002, 2001, 2001, 1993, 1990, 2014, 2001, 1989, 2016, 2008, 2000, 2001, 2014, 2006, 2000, 2010, 1995, 2013, 2016, 2017, 2017, 2007, 1989, 2018, 1993, 2005, 2004, 1998, 2013, 1992, 2006, 2012, 1991, 1996,

In [16]:
# Read into a DataFrame
keyword_counts_df = pd.DataFrame.from_dict(keyword_counts, orient='index', columns=['count'])
keyword_counts_df.head(100)

Unnamed: 0,count
system,191799
network,166093
model,111842
analysis,101884
algorithm,97362
data,83604
method,81866
approach,73121
problem,66562
application,66391


## Save Output to File

### 1. Keyword-counts

In [17]:
f = open("./data/keyword_counts.json","w")
json.dump(keyword_counts, f)
f.close()

In [18]:
# Check if it can be read
f = open("./data/keyword_counts.json","r")
keyword_counts_json_read = json.load(f)
f.close()

In [19]:
#print(type(keyword_counts_json_read))

# Read into a dataframe
keyword_counts_df = pd.DataFrame.from_dict(keyword_counts_json_read, orient='index', columns=['count'])
keyword_counts_df.head(10)

Unnamed: 0,count
system,191799
network,166093
model,111842
analysis,101884
algorithm,97362
data,83604
method,81866
approach,73121
problem,66562
application,66391


### 2. Keyword-years

In [20]:
f = open("./data/keyword_years.json","w")
json.dump(keyword_years, f)
f.close()

In [21]:
# Check if it can be read
f = open("./data/keyword_years.json","r")
keyword_years_json_read = json.load(f)
f.close()

In [22]:
print(type(keyword_years_json_read))
print_first_n_dict_items(keyword_years_json_read, "keyword_years_json_read", 3)

<class 'dict'>

Printing first 3 items of dict named keyword_years_json_read 
parallel : [1996, 2000, 1989, 1992, 1993, 1995, 1996, 2001, 1992, 1991, 2000, 1981, 2007, 1981, 1995, 2010, 1988, 1983, 2007, 1976, 1987, 1997, 1994, 1974, 1975, 1998, 1984, 1984, 1984, 1992, 1987, 1983, 1996, 1976, 1982, 1999, 2011, 1992, 2006, 1998, 2005, 2005, 1988, 2001, 1987, 1999, 2010, 1995, 1998, 2014, 2000, 2001, 1996, 2002, 2006, 2018, 2001, 2014, 1992, 2016, 1998, 2007, 1998, 1987, 2013, 1995, 1992, 2018, 1999, 2002, 1992, 1996, 2001, 2006, 1994, 1994, 2002, 1992, 2010, 2007, 1995, 1992, 2018, 2010, 2018, 1998, 1994, 2014, 2017, 2001, 1995, 2002, 1998, 2000, 1992, 2000, 2000, 1989, 1994, 1990, 1988, 2015, 1990, 2010, 2013, 2014, 1995, 1991, 1991, 2003, 1986, 2018, 2003, 2006, 2003, 1987, 2000, 2013, 2018, 2003, 2010, 1995, 2002, 2001, 2001, 1993, 1990, 2014, 2001, 1989, 2016, 2008, 2000, 2001, 2014, 2006, 2000, 2010, 1995, 2013, 2016, 2017, 2017, 2007, 1989, 2018, 1993, 2005, 2004, 1998, 2013, 1992

### 3. Title-years-author

In [23]:
print(len(title_years_authors))
#print(title_years_authors[2008413:])
#title_years_authors[2008412:]
title_years_authors[-1]

2008467


[6738037, 'Object ADTs with improvements for Value ADTs', 1991, '']

In [24]:
writer = csv.writer(open("./data/title_years_authors.csv", "w", encoding="UTF", newline=''))
writer.writerows(title_years_authors)

In [25]:
reader = csv.reader(open("./data/title_years_authors.csv", "r", encoding="UTF", newline=''))
line_count = 0
last_row = -1
for row in reader:
    if line_count < 10:
        if int(row[0]) != title_years_authors[line_count][0]:
            print("row[0]:", row[0], "\t var:", title_years_authors[line_count][0])
    line_count += 1
    last_row = row
print("Lines read:", line_count)
print("Last row:", last_row)

Lines read: 2008413
Last row: ['6737983', 'Optimal Static Output Feedback Design By Using a Trust Region Interior Point Method', '2000', '']


In [26]:
i = 0
for val in title_years_authors:
    if i < 10:
        print(val)
    i += 1

[0, 'Parallel Integer Sorting and Simulation Amongst CRCW Models', 1996, '']
[1, 'Pattern Matching in Trees and Nets', 1983, '']
[2, 'NP-complete Problems Simplified on Tree Schemas', 1983, '']
[3, 'On the Power of Chain Rules in Context Free Grammars', 1982, '']
[4, 'Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2', 1977, '']
[5, 'A characterization of rational D0L power series', 2011, '']
[6, 'The Derivation of Systolic Implementations of Programs', 1987, '']
[7, 'Fifo Nets Without Order Deadlock', 1988, '']
[8, 'On the Complementation Rule for Multivalued Dependencies in Database Relations', 1978, '']
[9, 'Equational weighted tree transformations', 2012, '']


In [27]:
# # JSON Write
# w = csv.writer(open("./data/title_years_authors.csv", "w", encoding="UTF", newline=''))
# for key, val in title_years_authors.items():
#     w.writerow([key, val])

In [28]:
# with open('./data/titles_years_authors.json') as titles_years_auth_json_file:  
#     data = json.load(titles_years_auth_json_file)
#     i = 0
#     for d in data:
#         if i < 10:
#             print(d)
#         i += 1

### 4. Title-lemmatized

In [29]:
f = open("./data/titles_lemmatized.json","w")
json.dump(title_lemma, f)
f.close()

In [30]:
# Check if it can be read
f = open("./data/titles_lemmatized.json","r")
title_lemma_json_read = json.load(f)
f.close()

In [31]:
print(type(title_lemma_json_read))
print_first_n_dict_items(title_lemma_json_read, "title_lemma_json_read", 10)

<class 'dict'>

Printing first 10 items of dict named title_lemma_json_read 
Parallel Integer Sorting and Simulation Amongst CRCW Models : ['parallel', 'integer', 'sorting', 'simulation', 'crcw', 'model']
Pattern Matching in Trees and Nets : ['pattern', 'matching', 'tree', 'net']
NP-complete Problems Simplified on Tree Schemas : ['np-complete', 'problem', 'simplified', 'tree', 'schema']
On the Power of Chain Rules in Context Free Grammars : ['power', 'chain', 'rule', 'context', 'free', 'grammar']
Schnelle Multiplikation von Polynomen über Körpern der Charakteristik 2 : ['schnelle', 'multiplikation', 'von', 'polynomen', 'über', 'körpern', 'der', 'charakteristik', '2']
A characterization of rational D0L power series : ['characterization', 'rational', 'd0l', 'power', 'series']
The Derivation of Systolic Implementations of Programs : ['derivation', 'systolic', 'implementation', 'program']
Fifo Nets Without Order Deadlock : ['fifo', 'net', 'order', 'deadlock']
On the Complementation Rule fo

## Clear data

In [32]:
del keyword_counts_json_read
del keyword_counts_df

del keyword_years_json_read

del title_lemma_json_read