In [1]:
## Importing packages

import json
import os
import re
import random
import pickle
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from matplotlib import pyplot as plt


In [5]:

# Load the pickled collection of selected paper ids; it is used further down
# for membership tests against each article's 'article_id'.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
conference_papers_path = os.path.join('output','conference_papers.pkl')
with open(conference_papers_path,'rb') as pkl_in:
    conference_papers = pickle.load(pkl_in)


In [6]:
## Defining regular expression filters for the sentences

import re

def pattern_search(sentence_list, whitespace_separated = False):
    """Filter sentences through an ordered series of regular-expression checks.

    Every sentence is tested against the checks below, in order. The first
    check whose pattern is found in the sentence rejects it, and the label of
    that check is recorded as the rejection reason. A sentence that passes
    every check is kept and its reason is the string 'None'.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to be filtered.
    whitespace_separated : bool, default False
        Selects the final check. When equal to False, sentences containing
        " i.e.", " e.g." or " fig " are rejected; otherwise sentences in
        which a letter is directly followed by one of . ! ? - are rejected.

    Returns
    -------
    (list of str, list of str)
        The kept sentences, and one reason label per input sentence.
    """
    # Ordered (label, pattern) pairs; the first matching pattern decides the reason.
    checks = [
        # "@x", "[" followed by whitespace, or "{"
        ('p_basic', r"(@x)|(\[\s)|(\{)"),
        # any character other than letters, whitespace and . - , ( ) ! ? " ' : ;
        ('p_char', r"[^a-zA-Z\s.\-,()!?\"':;]"),
        # whitespace + single word character + period + whitespace (e.g. " j. ")
        ('p_cit', r"\s\w[.]\s"),
        # "et al"
        ('p_etal', r"et al"),
        # "http", "www" or ".com"
        ('p_link', r"(http)|(www)|([.]com)"),
    ]

    # Both variants of the final check report the same combined label.
    if whitespace_separated == False:
        final_pattern = r"\si\.e\.|\se\.g\.|\sfig\s"
    else:
        final_pattern = r"[a-zA-Z][.!?\-]"
    checks.append(('p_ieg / p_punct', final_pattern))

    kept_sentences = []
    reasons = []
    for sentence in sentence_list:
        for label, pattern in checks:
            if re.search(pattern, sentence):
                reasons.append(label)
                break
        else:
            # No pattern matched: the sentence is clean.
            kept_sentences.append(sentence)
            reasons.append('None')

    return kept_sentences, reasons


In [8]:
## Creating sentences dataset
# Stream the JSON-lines training file, keep only the articles whose id is in
# `conference_papers`, and collect the sentences that pass `pattern_search`.

sentence_list = []   # sentences that passed every filter, over all selected papers
reason_list = []     # one filter label per examined sentence ('None' = kept)
counter = 0          # number of selected papers

start_time = time.time()
with open(os.path.join('arxiv-dataset',"train.txt"),'r') as scipapers:
    for art in tqdm(scipapers): #203037 papers in total
        article = json.loads(art)
        # `article_id` instead of `id`, so the builtin id() is not shadowed
        article_id = article['article_id']
        text = article['article_text']  # presumably a list of sentence strings - confirm against the dataset
        if article_id not in conference_papers:
            continue
        counter += 1
        # applying regular expressions filters
        article_sentence_list, article_reason_list = pattern_search(text, whitespace_separated = True)
        # extend() appends in place; `a = a + b` would re-copy the whole
        # accumulated list for every selected paper
        sentence_list.extend(article_sentence_list)
        reason_list.extend(article_reason_list)

print(f"From {counter} CS/ML/mathstat papers, the loop collected {len(sentence_list)} sentences in {time.time() - start_time} seconds")


203037it [01:13, 2749.05it/s]From 2700 CS/ML/mathstat papers, the loop collected 153055 sentences in 73.85962152481079 seconds



In [9]:
## Removing double whitespaces and unnecessary whitespaces in front and at the end of sentences

def sentence_filter(sentence_list):
    """Normalise whitespace and punctuation spacing in every sentence.

    The substitutions below are applied to each sentence in the listed order,
    after which any whitespace left at the end of the sentence is stripped.

    Parameters
    ----------
    sentence_list : iterable of str
        Sentences to clean.

    Returns
    -------
    list of str
        The cleaned sentences, in the same order as the input.
    """
    # (compiled pattern, replacement) pairs; the order matters.
    substitutions = [
        # Multiple whitespaces to one
        (re.compile(r"\s+"), r" "),
        # Clean second part of sentence if letter+whitespace+period is followed by any text
        (re.compile(r"(\w\s[.]).+"), r"\1"),
        # Clean whitespaces before punctuations
        (re.compile(r"\s([.,!?:;])"), r"\1"),
        # Clean start of sentence (drop leading non-word characters)
        (re.compile(r"^[^\w]+"), r""),
        # Clean whitespaces before and after hyphen and parentheses
        (re.compile(r"([(\-])\s(\w)"), r"\1\2"),
        (re.compile(r"(\w)\s([)\-])"), r"\1\2"),
    ]

    clean_sentence_list = []
    for sent in sentence_list:
        for pattern, replacement in substitutions:
            sent = pattern.sub(replacement, sent)
        # A trailing space survives the substitutions whenever the sentence
        # does not end in "<letter> ." (e.g. "... right ? "), so strip it here.
        clean_sentence_list.append(sent.rstrip())
    return clean_sentence_list


In [10]:
## Running the cleaner function over the collected sentences

clean_sentence_list = sentence_filter(sentence_list)

# Quick check: number of cleaned sentences, then the first ten of them
print(len(clean_sentence_list))
print(clean_sentence_list[:10])


153055
['in the last years wireless communication systems coped with the problem of delivering reliable information while granting high throughput.', 'several works addressed the parallelization of turbo decoder architectures to achieve higher throughput.', 'although throughput and area have been the dominant metrics driving the optimization of turbo decoders, recently, the need for flexible systems able to support different operative modes, or even different standards, has changed the perspective.', 'multi-asip is an effective solution.', 'thus, together with flexible and high throughput processing elements, a multi-asip architecture must feature also a flexible and high throughput interconnection backbone.', 'in this work a general framework to design network on chip based turbo decoder architectures has been presented.', 'the proposed framework can be adapted to explore different topologies, degrees of parallelism, message injection rates and routing algorithms.', 'experimental resu

In [11]:
## Keeping the middle ~90% of sentences by token count (strictly between the 5th and 95th percentiles)

from nltk.tokenize import word_tokenize

def keep_mean(clean_sentence_list):
    """Drop the shortest and longest sentences, measured in tokens.

    Each sentence is tokenised with `word_tokenize`. A sentence is kept only
    when its token count is strictly greater than the 0.05 quantile and
    strictly smaller than the 0.95 quantile of all token counts.

    Parameters
    ----------
    clean_sentence_list : iterable of str
        Sentences to filter.

    Returns
    -------
    list of str
        The kept sentences, in their original order.
    """
    # One [sentence, token count] row per input sentence
    rows = [[text, len(word_tokenize(text))] for text in clean_sentence_list]
    table = pd.DataFrame(rows, columns = ['sentence','len'])

    # Quantile bounds; both comparisons below are strict
    upper_bound = table['len'].quantile(0.95)
    lower_bound = table['len'].quantile(0.05)
    within_bounds = (table['len'] < upper_bound) & (table['len'] > lower_bound)

    return table.loc[within_bounds, 'sentence'].tolist()


In [12]:
## Applying the token-count quantile filter (keep_mean) to the cleaned sentences

cs_sentence_list = keep_mean(clean_sentence_list)
# Number of sentences remaining after the filter
print(len(cs_sentence_list))


134001


In [13]:
## Saving the filtered sentence list as a pickle file

sentence_pickle_path = os.path.join('output','cs_conf_sentence_list.pkl')
with open(sentence_pickle_path, 'wb') as pkl_out:
    pickle.dump(cs_sentence_list, pkl_out)
