In [1]:
import pandas as pd
import re
import time

In [2]:
class dictionary(dict):
    """
    Extends python dictionary in order to have
    index --> word
    but also
    word --> index
    """
    def __init__(self):
        super(dictionary, self).__init__()
        self.index = {}
        self.size = 0
    
    def __setitem__(self, key, value):
        super(dictionary, self).__setitem__(key, value)
        self.index[value] = key
        self.size += 1
    
    def __delitem__(self, key):
        value = super().pop(key)
        ignore = self.index.pop(value)
        self.size -=1

In [3]:
def process_corpus(corpus, context_size, dictionary, fixed_dictionary=False):
    list_of_points = []
    for document in corpus:
        list_of_points += process_document(document, context_size, dictionary, fixed_dictionary)
    return list_of_points


def process_document(document, context_size, dictionary, fixed_dictionary=False):
    text = document.lower()
    p = re.compile("[a-z]+")
    tokens = p.findall(text)
    list_of_points = []
    for i in range(len(tokens) - context_size + 1):
        data_point = [0 for l in range(context_size)]
        add_new_data_point = True
        for j in range(context_size):
            k = i+j
            if tokens[k] not in dictionary.index:
                if fixed_dictionary:
                    # only takes series of words in the dictionary
                    add_new_data_point = False
                    break
                else:
                    new_Ix = dictionary.size
                    dictionary[new_Ix] = tokens[k]
            data_point[j] = dictionary.index[tokens[k]]
        if add_new_data_point:
            list_of_points.append(tuple(data_point))
    return list_of_points
        


In [4]:
# Define some important values
CONTEXT_SIZE = 4
DICT_SIZE = 17000

### For the arXiv corpus

In [5]:
data = pd.read_csv("./arxiv_articles.csv", sep="|")
s = time.time()
mydict = dictionary()
#dataset = process_corpus(data['summary'], CONTEXT_SIZE, mydict)
#t = time.time() - s
#print("Done in {} seconds".format(int(t)))
data

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,text
id,title,authors,arxiv_primary_category,summary,published,updated
http://arxiv.org/abs/2001.05867v1,$σ$-Lacunary actions of Polish groups,Jan Grebik,math.LO,"We show that every essentially countable orbit equivalence relation induced by a continuous action of a Polish group on a Polish space is $\sigma$-lacunary. In combination with [Invent. Math.201 (1), 309-383, 2015] we obtain a straightforward proof of the result from [Adv. Math.307, 312-343,2017] that every essentially countable equivalence relation that is induced by an action of abelian non-archimedean Polish group is essentially hyperfinite.",2020-01-16T15:09:02Z,2020-01-16T15:09:02Z
http://arxiv.org/abs/1303.6933v1,Hans Grauert (1930-2011),Alan Huckleberry,math.HO,Hans Grauert died in September of 2011. This article reviews his life in mathematics and recalls some detail his major accomplishments.,2013-03-27T19:23:57Z,2013-03-27T19:23:57Z
http://arxiv.org/abs/1407.3775v1,A New Proof of Stirling's Formula,Thorsten Neuschel,math.HO,A new simple proof of Stirling's formula via the partial fraction expansion for the tangent function is presented.,2014-07-10T11:26:39Z,2014-07-10T11:26:39Z
http://arxiv.org/abs/math/0307381v3,On Dequantization of Fedosov's Deformation Quantization,Alexander V. Karabegov,math.QA,"To each natural deformation quantization on a Poisson manifold M we associate a Poisson morphism from the formal neighborhood of the zero section of the cotangent bundle to M to the formal neighborhood of the diagonal of the product M x M~, where M~ is a copy of M with the opposite Poisson structure. We call it dequantization of the natural deformation quantization. Then we ""dequantize"" Fedosov's quantization.",2003-07-30T06:20:33Z,2003-09-20T01:29:18Z
...,...,...,...,...,...,...
http://arxiv.org/abs/1902.05717v1,A New Smoothing Technique based on the Parallel Concatenation of Forward/Backward Bayesian Filters: Turbo Smoothing,Giorgio M. Vitetta;Pasquale Di Viesti;Emilio Sirignano,stat.CO,"Recently, a novel method for developing filtering algorithms, based on the parallel concatenation of Bayesian filters and called turbo filtering, has been proposed. In this manuscript we show how the same conceptual approach can be exploited to devise a new smoothing method, called turbo smoothing. A turbo smoother combines a turbo filter, employed in its forward pass, with the parallel concatenation of two backward information filters used in its backward pass. As a specific application of our general theory, a detailed derivation of two turbo smoothing algorithms for conditionally linear Gaussian systems is illustrated. Numerical results for a specific dynamic system evidence that these algorithms can achieve a better complexity-accuracy tradeoff than other smoothing techniques recently appeared in the literature.",2019-02-15T08:21:22Z,2019-02-15T08:21:22Z
http://arxiv.org/abs/1902.06861v2,Computation of the expected value of a function of a chi-distributed random variable,Paul Kabaila;Nishika Ranathunga,stat.CO,"We consider the problem of numerically evaluating the expected value of a smooth bounded function of a chi-distributed random variable, divided by the square root of the number of degrees of freedom. This problem arises in the contexts of simultaneous inference, the selection and ranking of populations and in the evaluation of multivariate t probabilities. It also arises in the assessment of the coverage probability and expected volume properties of the some non-standard confidence regions. We use a transformation put forward by Mori, followed by the application of the trapezoidal rule. This rule has the remarkable property that, for suitable integrands, it is exponentially convergent. We use it to create a nested sequence of quadrature rules, for the estimation of the approximation error, so that previous evaluations of the integrand are not wasted. The application of the trapezoidal rule requires the approximation of an infinite sum by a finite sum. We provide a new easily computed upper bound on the error of this approximation. Our overall conclusion is that this method is a very suitable candidate for the computation of the coverage and expected volume properties of non-standard confidence regions.",2019-02-19T02:23:36Z,2019-12-17T04:46:30Z
http://arxiv.org/abs/1902.07706v1,EcoMem: An R package for quantifying ecological memory,Malcolm S. Itter;Jarno Vanhatalo;Andrew O. Finley,stat.CO,"Ecological processes may exhibit memory to past disturbances affecting the resilience of ecosystems to future disturbance. Understanding the role of ecological memory in shaping ecosystem responses to disturbance under global change is a critical step toward developing effective adaptive management strategies to maintain ecosystem function and biodiversity. We developed EcoMem, an R package for quantifying ecological memory functions using common environmental time series data (continuous, count, proportional) applying a Bayesian hierarchical framework. The package estimates memory functions for continuous and binary (e.g., disturbance chronology) variables making no a priori assumption on the form of the functions. EcoMem allows users to quantify ecological memory for a wide range of ecosystem processes and responses. The utility of the package to advance understanding of the memory of ecosystems to environmental drivers is demonstrated using a simulated dataset and a case study assessing the memory of boreal tree growth to insect defoliation.",2019-02-20T18:59:55Z,2019-02-20T18:59:55Z
http://arxiv.org/abs/1902.09029v1,Snowboot: Bootstrap Methods for Network Inference,Yuzhou Chen;Yulia R. Gel;Vyacheslav Lyubchich;Kusha Nezafati,stat.CO,"Complex networks are used to describe a broad range of disparate social systems and natural phenomena, from power grids to customer segmentation to human brain connectome. Challenges of parametric model specification and validation inspire a search for more data-driven and flexible nonparametric approaches for inference of complex networks. In this paper we discuss methodology and R implementation of two bootstrap procedures on random networks, that is, patchwork bootstrap of Thompson et al. (2016) and Gel et al. (2017) and vertex bootstrap of Snijders and Borgatti (1999). To our knowledge, the new R package snowboot is the first implementation of the vertex and patchwork bootstrap inference on networks in R. Our new package is accompanied with a detailed user's manual, and is compatible with the popular R package on network studies igraph. We evaluate the patchwork bootstrap and vertex bootstrap with extensive simulation studies and illustrate their utility in application to analysis of real world networks.",2019-02-24T22:31:43Z,2019-02-24T22:31:43Z


In [6]:
mydict.size

64515

In [7]:
len(dataset)

5447489

In [8]:
dataset[:10]

[(0, 1, 2, 3),
 (1, 2, 3, 4),
 (2, 3, 4, 5),
 (3, 4, 5, 6),
 (4, 5, 6, 7),
 (5, 6, 7, 8),
 (6, 7, 8, 9),
 (7, 8, 9, 10),
 (8, 9, 10, 11),
 (9, 10, 11, 12)]

In [9]:
data_df = pd.DataFrame(dataset)

In [10]:
data_df.head()

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,1,2,3,4
2,2,3,4,5
3,3,4,5,6
4,4,5,6,7


In [11]:
# Count the number of occurrences for each word
word_counts = data_df.iloc[:, 0].value_counts()
word_counts

30       371927
14       241963
49       146379
11       136600
68       120327
          ...  
40063         1
35965         1
42447         1
48251         1
63425         1
Name: 0, Length: 63202, dtype: int64

In [12]:
print("These are the most frequent words: {}".format((mydict[30], mydict[14], mydict[49], mydict[11])))

These are the most frequent words: ('the', 'of', 'and', 'a')


In [13]:
# Now, we want to keep only a subset of all the words
# we define a fixed size for the dictionary and we 
# keep the words, starting from the most frequent ones

words2keep = word_counts.keys()[:DICT_SIZE]

In [14]:
words2keep

Int64Index([   30,    14,    49,    11,    68,    22,     0,    19,    64,
                2,
            ...
            55667, 37971, 22882, 18427, 41792, 25705,  6781, 14941, 25061,
            45905],
           dtype='int64', length=17000)

In [15]:
# Now, we create a new dictionary with the
# words selected in the previous step
new_dictionary = dictionary()
for i in range(len(words2keep)):
    new_dictionary[i] = mydict[words2keep[i]]
    



In [16]:
# With the new dictionary, build the new training dataset

# Creating the training dataset using series of 4 words 
# appearing in the text
s = time.time()
new_dataset = process_corpus(data['summary'], CONTEXT_SIZE, new_dictionary, fixed_dictionary=True)
t = time.time() - s
print("Done in {} seconds".format(int(t)))

Done in 11 seconds


In [17]:
len(new_dataset)

5056387

In [18]:
new_dictionary.size

17000

In [19]:
new_dataset[:10]

[(6, 34, 9, 1229),
 (34, 9, 1229, 2101),
 (9, 1229, 2101, 9555),
 (1229, 2101, 9555, 1457),
 (2101, 9555, 1457, 2214),
 (9555, 1457, 2214, 790),
 (1457, 2214, 790, 757),
 (2214, 790, 757, 14),
 (790, 757, 14, 3),
 (757, 14, 3, 347)]

In [20]:
data['summary'][0]

'We show that every essentially countable orbit equivalence relation induced by a continuous action of a Polish group on a Polish space is $\\sigma$-lacunary. In combination with [Invent. Math.201 (1), 309-383, 2015] we obtain a straightforward proof of the result from [Adv. Math.307, 312-343,2017] that every essentially countable equivalence relation that is induced by an action of abelian non-archimedean Polish group is essentially hyperfinite.'

In [21]:
print(new_dictionary[6], new_dictionary[34], new_dictionary[9], new_dictionary[1229])

we show that every


In [22]:
new_dictionary

{0: 'the',
 1: 'of',
 2: 'and',
 3: 'a',
 4: 'to',
 5: 'in',
 6: 'we',
 7: 'is',
 8: 'for',
 9: 'that',
 10: 'with',
 11: 'this',
 12: 'on',
 13: 'are',
 14: 'by',
 15: 'as',
 16: 'an',
 17: 'model',
 18: 'be',
 19: 'from',
 20: 'which',
 21: 'data',
 22: 'can',
 23: 'our',
 24: 'it',
 25: 'at',
 26: 'time',
 27: 'based',
 28: 'these',
 29: 'models',
 30: 'or',
 31: 'using',
 32: 'have',
 33: 'results',
 34: 'show',
 35: 'two',
 36: 'method',
 37: 'between',
 38: 'paper',
 39: 'also',
 40: 'such',
 41: 's',
 42: 'has',
 43: 'not',
 44: 'new',
 45: 'their',
 46: 'study',
 47: 'one',
 48: 'analysis',
 49: 'used',
 50: 'approach',
 51: 'high',
 52: 'its',
 53: 'distribution',
 54: 'methods',
 55: 'problem',
 56: 'different',
 57: 'present',
 58: 'more',
 59: 'both',
 60: 'when',
 61: 'been',
 62: 'than',
 63: 'well',
 64: 'large',
 65: 'number',
 66: 'system',
 67: 'non',
 68: 'function',
 69: 'use',
 70: 'proposed',
 71: 'algorithm',
 72: 'but',
 73: 'process',
 74: 'some',
 75: 'where',

### For the ASRS corpus

In [23]:
#data_ASRS = pd.read_csv("./ASRS_data.csv", sep="|")

In [24]:
#data_ASRS['Narrative'][0]

In [25]:
#s = time.time()
#mydict_ASRS = dictionary()
#dataset_ASRS = process_corpus(data_ASRS['Narrative'], CONTEXT_SIZE, mydict_ASRS)
#t = time.time() - s
#print("Done in {} seconds".format(int(t)))