## Text Pre-Processing & Feature Generation

In [1]:
#importing required libraries for all tasks
import requests # It also allows you to access the response data of Python.Download pdf using respective links
import re # for working with Regular Expressions.
import pandas as pd # for data manipulation and analysis
import nltk.data # The Natural Language Toolkit used for text processing libraries for tokenization, parsing, etc
import itertools # used chain method from intertools to merge sublists
from nltk.tokenize import RegexpTokenizer # tokenizer uses regex to tokenize
from nltk.probability import FreqDist # used to generate frequency distribution
from nltk.stem import PorterStemmer # used to stem tokens

## Sparse Feature Generation

*  <font size= '3'> **re.sub(x,y,z)**   :   Replaces y with x in z.Operation is done on strings and return value is a string 
* **re.findall(x,y)**   :   Finds x pattern/string in y and returns the matches in a list
* **len()**    :    The len() function returns the number of items in an object.

In [2]:
#function to extract text from pdf
import io # for stream handling

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter #libraries for extracting info from PDF, analysing data etc
from pdfminer.converter import TextConverter #converting PDF files into other text formats
from pdfminer.layout import LAParams #performing layout analysis
from pdfminer.pdfpage import PDFPage # processing the page contents


def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [3]:
# extracting links from pdf
links_text  = convert_pdf_to_txt('paper-ids.pdf') # calling convert_pdf_to_txt function

In [4]:
# pattern to extarct pdf id and respective pdf download link
pattern = re.compile(r'(PP[\d]+.pdf) (https:[\w\W]+?)\n') 

In [5]:
links_list = re.findall(pattern,links_text) #extarct pdf id and respective pdf download link
links_list[0] #item 0

('PP3197.pdf',
 'https://drive.google.com/uc?export=download&id=1z9rts_mqD0rk1cztTomCZEmYu2Xv10iv')

#### Downloading PDF files from links

In [6]:
%%time

#5-6 min runtime for this code

for each in links_list:
    filename,link = each
    r = requests.get(link) # pass each link to requests() method
    with open(filename,'wb') as code:
        code.write(r.content) # write the content into PDF files

In [7]:
%%time

# using convert_pdf_to_txt function to convert all 200 PDF to text. Text is stored in a list of lists

all_doc_list = []
for filename,link in links_list:
    each_doc = convert_pdf_to_txt(filename)
    all_doc_list.append(each_doc)

Wall time: 2min 45s


**Extracting title,abstract,body and authors from all the 200 document sublists**

In [8]:
##  re.compile for compiling pattern into pattern objects for body, title, abstract and authors.
body_pattern = re.compile(r'Paper Body([\W\w]+)[\d]+ References')
title_pattern = re.compile(r'([\W\w\s\S]+?)Authored by:(.*)')
abstract_pattern = re.compile(r'Abstract([\w\W\s\S]+?)Paper Body')
authors_pattern = re.compile(r'Authored by:\n\n([\w\W]+?)\n\nAbstract')
# creating empty lists for body, title, abstract and authors.
all_body_list = []
all_title = []
all_abstract = []
all_authors = []
# extracting body, title, abstract and authors and appending them into a list.
for each in all_doc_list:
    all_body_list.append(re.search(body_pattern,each).group(1))
    all_title.append(re.search(title_pattern,each).group(1))
    all_abstract.append(re.search(abstract_pattern,each).group(1))
    all_authors.append(re.search(authors_pattern,each).group(1))

Before we do the sentence segmentation of the body, we need to clean the body. The below function is defined to clean the body

In [9]:
# function to clean the body
def clean_body(body):
    re_list = ['\n','- ','\?\d\s\?\s','\?\d\s','-\?\s','-?','\?','ﬀ','ﬁ','ﬃ','ﬄ','ﬂ'] # to replace char
    sub_list = [' ','','','','','','\'','ff','fi','ffi','ffl','fl']  # by replace char
    for i in range(0,len(re_list)):
        body = re.sub(re_list[i],sub_list[i],body).strip()
    return body

#### E. Sentence Segmentation of body

In [10]:
# sentence segmentation
all_body_sentence = [] # creating an empty list for all body.
for each_body in all_body_list:
    body = clean_body(each_body)
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') # this tokenizer divides a text into a list of sentences
    sentences = sent_detector.tokenize(body) # for converting text strings to streams of token objects
    all_body_sentence.append(sentences)

In [11]:
all_body_sentence[0][0:3]

['Current reinforcementlearning (RL) techniques hold great promise for creating a general type of artificial intelligence (AI), specifically autonomous (software) agents that learn difficult tasks with limited feedback (Sutton & Barto, 1998).',
 'Applied RL has been very successful, producing worldclass computer backgammon players (Tesauro, 1994) and model helicopter flyers (Ng et al., 2003).',
 'Many applications of RL, including the two above, utilize supervisedlearning techniques for the purpose of generalization.']

**Converting first capital letters to lower case for each sentence** <br>
**normalizing to lowercase except the capital tokens appearing in the
middle of a sentence/line**

In [12]:
%%time
# first capital letter to lower()
total_body_sentences = [] # stores all sentences as list of lists
for each in all_body_sentence:
    each_sentence_lower = [] # stores sentences for each body
    for sentence in each:
        sentence = sentence[0].lower()+sentence[1:] # slicing the sentence to lower the first char
        each_sentence_lower.append(sentence)
    total_body_sentences.append(''.join(each_sentence_lower)) 

Wall time: 54 ms


In [13]:
len(total_body_sentences)

200

#### A. Tokenization <br>
**Word tokenization of body using regular expression, r"[A-Za-z]\w+(?:[- '?]\w+)?"**

In [14]:
# from nltk.tokenize import RegexpTokenizer
unigram_tokens= []
for each in total_body_sentences:
    tokenizer = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?") # regex to tokenize
    unigram_tokens.append(tokenizer.tokenize(each))

In [15]:
unigram_tokens[0][1:5]

['reinforcementlearning', 'RL', 'techniques', 'hold']

#### G. First 200 meaningful bigrams, based on highest total frequency in the corpus

Generating bigrams

In [16]:
# generating bigrams 
all_doc_bigrams = []
for each_doc_token_list in unigram_tokens:
    per_doc_bigrams = list(nltk.bigrams(each_doc_token_list)) # nltk.bigrams() to generate bigrams from an input list
    all_doc_bigrams.append(per_doc_bigrams)
# all_doc_bigrams[0]

In [17]:
all_doc_bigrams[0][1:5]

[('reinforcementlearning', 'RL'),
 ('RL', 'techniques'),
 ('techniques', 'hold'),
 ('hold', 'great')]

Reading stopwords.txt and storing stopwords in a list

In [18]:
#reading the given text file to get context_independent_tokens 
with open('stopwords_en.txt') as stop:
    context_independent_tokens = stop.read().split('\n')
context_independent_tokens[0:5]

['a', "a's", 'able', 'about', 'above']

Removing all the bigrams with Stop words as part of them

In [19]:
%%time
# each_do_bigrams = []
all_doc_bigrams_no_stop = [] # stores all bigrams as list of lists
for each_doc_bigrams in all_doc_bigrams:
    each_do_bigrams = [] # list stores each document bigrams
    for each_bigram in each_doc_bigrams:
        x,y = each_bigram # tuple unpacking (into two unigrams)
        if x.lower() in context_independent_tokens or y.lower() in context_independent_tokens:
            pass
        else:
            bigram_new = x+'__'+y # joining the tokens again with '__' double underscore
            each_do_bigrams.append(bigram_new)
    all_doc_bigrams_no_stop.append(each_do_bigrams)   

Wall time: 7.61 s


In [20]:
all_doc_bigrams_no_stop[0][1:5]

['reinforcementlearning__RL',
 'RL__techniques',
 'techniques__hold',
 'hold__great']

Calculating Freq distribution of bigrams across all 200 docs

In [21]:
# from nltk.probability import FreqDist
all_doc_freq_dist_bigrams = []
for each_doc_list in all_doc_bigrams_no_stop:
    fdbigram = FreqDist(each_doc_list) # using FreqDist() to calculate distribution of each token in the list
    all_doc_freq_dist_bigrams.append(fdbigram)
all_doc_freq_dist_bigrams[0]

FreqDist({'Linear__Regression': 8, 'yi__xi': 7, 'valid__prediction': 7, 'linear__regression': 6, 'parameter__vector': 6, 'high__probability': 6, 'sample__complexity': 6, 'RL__algorithm': 5, 'state__space': 5, 'ith__row': 5, ...})

Using intertools to join all the lists containing freq distributions of bigrams

In [22]:
# import itertools
combined_bigrams = list(itertools.chain.from_iterable(all_doc_freq_dist_bigrams)) #combine all sublist's into one
combined_bigrams[0:5]

['current__reinforcementlearning',
 'reinforcementlearning__RL',
 'RL__techniques',
 'techniques__hold',
 'hold__great']

In [23]:
# selecting top 200 frequently occuring bigrams in the corpus
top_200_bigrams_dist = FreqDist(combined_bigrams).most_common(200) #using FreqDist() to calculate dist of each token in the list
top_200_bigrams_dist[0:5]

[('machine__learning', 81),
 ('figure__shows', 71),
 ('optimization__problem', 71),
 ('future__work', 66),
 ('Processing__Systems', 63)]

In [24]:
# extracting top 200 bigram words from the freq distribution list
top_200_bigrams = []
for i in range(0,len(top_200_bigrams_dist)):
    x,y = top_200_bigrams_dist[i]
    top_200_bigrams.append(x)   
top_200_bigrams[0:5]

['machine__learning',
 'figure__shows',
 'optimization__problem',
 'future__work',
 'Processing__Systems']

#### B. The context-independent and context-dependent (with the threshold set to %95) stop words to be removed from the vocab.

Removing StopWords from unigram tokens across all 200 sublists

In [25]:
# removing context independent stopwords from unigram tokens
unigram_tokens_no_stop = []
for each_unigram_list in unigram_tokens:
    # 
    stopped_tokens = [w for w in each_unigram_list if w.lower() not in context_independent_tokens] # iterating across sublists
    unigram_tokens_no_stop.append(stopped_tokens)

In [26]:
unigram_tokens_no_stop[0][1:5]

['reinforcementlearning', 'RL', 'techniques', 'hold']

Extracting unique tokens from each sublist into a new list of sublists

In [27]:
# extarcting unique tokens from unigram_tokens_no_stop for each document
unigram_tokens_no_stop_unique = []
for each_unigram_list in unigram_tokens_no_stop:
    unigram_tokens_no_stop_unique.append(list(set(each_unigram_list)))
unigram_tokens_no_stop_unique[0][1:5]

['utilize', 'computational', 'normconstrained', 'development']

Combining all the unique unigrams from all 200 sublists into a single list

In [28]:
combined_unigram_tokens = list(itertools.chain.from_iterable(unigram_tokens_no_stop_unique))

In [29]:
len(combined_unigram_tokens) # total unigram tokens

156140

Removing unigrams that occur in more than 95% of the documents

In [30]:
threshold = 0.95*200 # 200 is the total documents, threshold=190
more_than_95_unigm = []
less_than_95_unigm = []
for each in FreqDist(combined_unigram_tokens).items():
    x,y = each
    if y>threshold:
        more_than_95_unigm.append(each) # list contains unigrams present in more than 95% of documents
    else:
        less_than_95_unigm.append(each) # list contains unigrams present less than 95% of documents
less_than_95_unigm[0:5]

[('shown', 169),
 ('utilize', 21),
 ('computational', 110),
 ('normconstrained', 1),
 ('development', 36)]

Extracting unique bigrams from each sublist into a new list of sublists

In [31]:
# extarcting unique tokens from all_doc_bigrams_no_stop for each document
bigrams_tokens_no_stop_unique = []
for each_bigram_list in all_doc_bigrams_no_stop:
    bigrams_tokens_no_stop_unique.append(list(set(each_bigram_list)))

Combining all the unique bigrams from all 200 sublists into a single list

In [32]:
combined_bigram_tokens = list(itertools.chain.from_iterable(bigrams_tokens_no_stop_unique))

Removing bigrams that occur in more than 95% of the documents

In [33]:
# removing bigrams that occur in more than 95% of the documents
threshold = 0.95*200 # 200 is the total documents, threshold=190
more_than_95_bigm = []
less_than_95_bigm = []
for each in sorted(FreqDist(combined_bigram_tokens).items(),key=lambda x:x[1],reverse=True):
    x,y = each
    if y>threshold:
        more_than_95_bigm.append(each) # list contains bigrams present in more than 95% of documents
    else:
        less_than_95_bigm.append(each)   # list contains bigrams present in less than 95% of documents
less_than_95_bigm[0:5]

[('machine__learning', 81),
 ('figure__shows', 71),
 ('optimization__problem', 71),
 ('future__work', 66),
 ('Processing__Systems', 63)]

#### D. Rare tokens (with the threshold set to 3%) must be removed from the vocab. 

In [34]:
# removing unigrams that occur in less than 3% of the documents
threshold = 0.03*200 # 200 is the total documents, threshold=6
less_than_3_unigm = []
more_than_3_unigm = []
for each in less_than_95_unigm:
    x,y = each
    if y<threshold:
        less_than_3_unigm.append(each) # list contains unigrams present in less than 3% of documents
    else:
        more_than_3_unigm.append(each) # list contains unigrams present in more than 3% of documents
more_than_3_unigm[0:5]

[('shown', 169),
 ('utilize', 21),
 ('computational', 110),
 ('development', 36),
 ('Markov', 53)]

In [35]:
# removing bigrams that occur in less than 3% of the documents
threshold = 0.03*200 # 200 is the total documents, threshold=6
less_than_3_bigm = []
more_than_3_bigm = []
for each in less_than_95_bigm:
    x,y = each
    if y<threshold:
        less_than_3_bigm.append(each) # list contains bigrams present in less than 3% of documents
    else:
        more_than_3_bigm.append(each)  # list contains bigrams present in more than 3% of documents
more_than_3_bigm[0:5]

[('machine__learning', 81),
 ('figure__shows', 71),
 ('optimization__problem', 71),
 ('future__work', 66),
 ('Processing__Systems', 63)]

Combining all the unigrams and bigrams after satisfying the above conditions

In [36]:
# generating final vocabulary with unigrams
first_vocab_unigrams = []
for each in more_than_3_unigm:
    x,y=each
    first_vocab_unigrams.append(x)
first_vocab_unigrams[0:5]

['shown', 'utilize', 'computational', 'development', 'Markov']

In [37]:
first_vocab_bigrams = []
for each in more_than_3_bigm:
    bigram,freq = each
    if bigram in top_200_bigrams:
        first_vocab_bigrams.append(bigram)
    else:
        pass
first_vocab_bigrams[0:5]

['machine__learning',
 'figure__shows',
 'optimization__problem',
 'future__work',
 'Processing__Systems']

Combining the lists into one

In [38]:
vocab_initial = []
vocab_initial.append(first_vocab_unigrams)
vocab_initial.append(first_vocab_bigrams)
first_vocab_combined = list(itertools.chain.from_iterable(vocab_initial))
first_vocab_combined[0:5]

['shown', 'utilize', 'computational', 'development', 'Markov']

In [39]:
len(first_vocab_combined)

4734

#### F. Tokens with the length less than 3 should be removed from the vocab.  

In [40]:
second_vocab = []
for each_token in first_vocab_combined:
    if len(each_token)<3: # checking tokens with length less than 3
        pass
    else:
        second_vocab.append(each_token) # tokens with len>=3
second_vocab[0:5]

['shown', 'utilize', 'computational', 'development', 'Markov']

In [41]:
len(second_vocab)

4408

 #### C. Unigram stemming using the Porter stemmer

In [42]:
# from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
pattern = re.compile('[\w]+__[\w]+') # this pattern will match bigrams from input list
vocab = []
for each in second_vocab:
    if re.search(pattern,each):
        vocab.append(each)
    else:
        vocab.append(stemmer.stem(each)) # stem unigrams from 
vocab = list(set(vocab)) # taking unique values into a list
vocab = sorted(vocab,key=str) # sorting the vocab words
vocab[0:6] # display top 6

['Barcelona__Spain',
 'Beach__CA',
 'CA__USA',
 'Figure__Comparison',
 'Figure__Left',
 'Figure__shows']

In [43]:
len(vocab) # total words in vocab (unigrams + bigrams)

2343

#### Generating Vocabulary index file <br>
**Format:** token_string:token_index

In [78]:
with open('vocab.txt','w+',encoding='utf-8') as textfile:
    for i in range(0,len(vocab)):
        textfile.write('{}:{}'.format(vocab[i],i))
        textfile.write('\n')

#### Generating Sparse count vectors file

For generating Sparse count vector we require all the unigrams and bigrams from a particular document to be together.So the below code creates a list of lists to store all uni and bi grams for each document

In [45]:
print(all_doc_bigrams_no_stop[0][0:5])
print(unigram_tokens_no_stop[0][0:5])

['current__reinforcementlearning', 'reinforcementlearning__RL', 'RL__techniques', 'techniques__hold', 'hold__great']
['current', 'reinforcementlearning', 'RL', 'techniques', 'hold']


In [46]:
# making a combined list of unigrams and bigrams for all 200 documents
# import itertools
uni_bi_zipped_list = []
uni_bi_grams_list = [] # to store all uni and bi grams for all 200 docs as sublists
for i in range(200):
    uni_bi_zipped_list.append(unigram_tokens_no_stop[i]) # extarcting unigrams from each sublist
    uni_bi_zipped_list.append(all_doc_bigrams_no_stop[i]) # extarcting bigrams from each sublist
    tokens_combined = list(itertools.chain.from_iterable(uni_bi_zipped_list)) # combining uni and bi grams for each sublist(doc)
    uni_bi_zipped_list = []
    uni_bi_grams_list.append(tokens_combined)
    
# generating freq distribution for each sublist of uni_bi_grams_list     
uni_bi_grams_list_freq = []
for i in range(len(uni_bi_grams_list)):
    freq = FreqDist(uni_bi_grams_list[i])
    uni_bi_grams_list_freq.append(freq)
uni_bi_grams_list_freq[0]

FreqDist({'algorithm': 53, 'state': 36, 'xt': 22, 'input': 19, 'RL': 17, 'prediction': 17, 'number': 16, 'linear': 16, 'problem': 16, 'MDP': 16, ...})

Extracting 200 document id's 

In [47]:
pdf_doc_ids = []
for each in links_list:
    doc_id,link = each
    pdf_doc_ids.append(doc_id[0:6])  
pdf_doc_ids[0:5]

['PP3197', 'PP3234', 'PP3244', 'PP3252', 'PP3389']

Writing into sparse count vectors textfile: <br>
**Format :** paper_id, token1_index:token1_wordcount, token2_index:token2_wordcount

In [79]:
%%time

with open('count_vectors.txt','w+') as count_vector_file:
    for i in range(200): # i used to iterate through paper_id and uni_bi_grams_list_freq
        paper_id = pdf_doc_ids[i]
        count_vector_file.write(paper_id) # writing paper_id to the document
        for each_token in vocab: # comparing each token vocab with tokens from each document
            if each_token in list(uni_bi_grams_list_freq[i].keys()): # checking in tokens of each document
                freq_each_token = uni_bi_grams_list_freq[i][each_token]
                index_of_each_token = vocab.index(each_token)
                count_vector_file.write(',{}:{}'.format(index_of_each_token,freq_each_token)) # token1_index:token1_wordcount
        count_vector_file.write('\n')    

Wall time: 16.9 s


## Statistics Generation

For Abstracts, Tokens must be normalized to lowercase except the capital tokens appearing in the middle of a sentence/line. (use sentence segmentation to achieve this). For Titles, tokens must be all normalised to lowercase

In [49]:
all_abstract[0]

'\n\nWe provide a provably eﬃcient algorithm for learning Markov Decision\nProcesses (MDPs) with continuous state and action spaces in the online\nsetting. Speciﬁcally, we take a model-based approach and show that a\nspecial type of online linear regression allows us to learn MDPs with (pos-\nsibly kernalized) linearly parameterized dynamics. This result builds on\nKearns and Singh’s work that provides a provably eﬃcient algorithm for\nﬁnite state MDPs. Our approach is not restricted to the linear setting,\nand is applicable to other classes of continuous MDPs.\n\n1 '

#### First we clean the abstarcts by iterating through the entitre list.

In [50]:
# abstarct cleaning
all_clean_abstract=[]
for each in all_abstract:
    each = re.sub('\n1','',each) # substituting 
    each = re.sub('\n\n','',each)
    each = re.sub('\n',' ',each).strip()
    all_clean_abstract.append(each)   

In [51]:
all_clean_abstract[0]

'We provide a provably eﬃcient algorithm for learning Markov Decision Processes (MDPs) with continuous state and action spaces in the online setting. Speciﬁcally, we take a model-based approach and show that a special type of online linear regression allows us to learn MDPs with (pos- sibly kernalized) linearly parameterized dynamics. This result builds on Kearns and Singh’s work that provides a provably eﬃcient algorithm for ﬁnite state MDPs. Our approach is not restricted to the linear setting, and is applicable to other classes of continuous MDPs.'

#### Sentence Segmentation

In [52]:
#sentence segmentation
all_abstract_sentence = []
for each_abstract in all_clean_abstract:
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') # this tokenizer divides a text into a list of sentences
    sentences = sent_detector.tokenize(each_abstract) # for converting text strings to streams of token objects
    all_abstract_sentence.append(sentences)
all_abstract_sentence[0]

['We provide a provably eﬃcient algorithm for learning Markov Decision Processes (MDPs) with continuous state and action spaces in the online setting.',
 'Speciﬁcally, we take a model-based approach and show that a special type of online linear regression allows us to learn MDPs with (pos- sibly kernalized) linearly parameterized dynamics.',
 'This result builds on Kearns and Singh’s work that provides a provably eﬃcient algorithm for ﬁnite state MDPs.',
 'Our approach is not restricted to the linear setting, and is applicable to other classes of continuous MDPs.']

**Converting first capital letters to lower case for each sentence** <br>
**normalizing to lowercase except the capital tokens appearing in the
middle of a sentence/line**

In [53]:
%%time
# first capital letter to lower()
total_abstract_sentences = []
for each in all_abstract_sentence: # interating through sublists
    each_sentence_lower = []
    for sentence in each: # interating through sentences in each sublist
        sentence = sentence[0].lower()+sentence[1:] # using string slicing to lower case the first char
        each_sentence_lower.append(sentence)
    total_abstract_sentences.append(''.join(each_sentence_lower)) # appending sentences to a list as a string

Wall time: 2.99 ms


In [54]:
total_abstract_sentences[0]

'we provide a provably eﬃcient algorithm for learning Markov Decision Processes (MDPs) with continuous state and action spaces in the online setting.speciﬁcally, we take a model-based approach and show that a special type of online linear regression allows us to learn MDPs with (pos- sibly kernalized) linearly parameterized dynamics.this result builds on Kearns and Singh’s work that provides a provably eﬃcient algorithm for ﬁnite state MDPs.our approach is not restricted to the linear setting, and is applicable to other classes of continuous MDPs.'

In [55]:
all_title[0]

'Online Linear Regression and Its Application to\n\nModel-Based Reinforcement Learning\n\n'

#### First we clean the titles by iterating through the entitre list.

In [56]:
all_clean_title=[]
for each in all_title:
    each = re.sub('\n\n',' ',each)
    each = re.sub('\n','',each).strip()
    all_clean_title.append(each)

In [57]:
all_clean_title[0]

'Online Linear Regression and Its Application to Model-Based Reinforcement Learning'

**Clean the titles by iterating through the entitre list**

In [58]:
all_authors[0]

'Michael L. Littman\nAlexander L. Strehl'

In [59]:
all_authors_clean = []
for each in all_authors:
    each = re.sub('\n\n','\n',each).split('\n')
    all_authors_clean.append(each)
all_authors_clean[0]

['Michael L. Littman', 'Alexander L. Strehl']

**Tokenizing abstarcts in each 200 sublist's** 

In [60]:
# from nltk.tokenize import RegexpTokenizer
abstract_tokens= []
for each in total_abstract_sentences:
    tokenizer = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?")
    abstract_tokens.append(tokenizer.tokenize(each))
abstract_tokens[1][0:5]

['guided', 'by', 'the', 'goal', 'of']

**Tokenizing title's in each 200 sublist's** 

In [61]:
# from nltk.tokenize import RegexpTokenizer
title_tokens= []
for each in all_clean_title:
    tokenizer = RegexpTokenizer(r"[A-Za-z]\w+(?:[-'?]\w+)?")
    title_tokens.append(tokenizer.tokenize(each.lower()))
title_tokens[1]

['topmoumoute', 'online', 'natural', 'gradient', 'algorithm']

**Removing context independent stop words from abstarct and title**

In [62]:
abstract_tokens_no_stop = []
for each_abstract_list in abstract_tokens:
    stopped_tokens = [w for w in each_abstract_list if w.lower() not in context_independent_tokens]
    abstract_tokens_no_stop.append(stopped_tokens)

In [63]:
abstract_tokens_no_stop[0][0:5]

['provide', 'provably', 'eﬃcient', 'algorithm', 'learning']

In [64]:
title_tokens_no_stop = []
for each_title_list in title_tokens:
    stopped_tokens = [w for w in each_title_list  if w.lower() not in context_independent_tokens]
    title_tokens_no_stop.append(stopped_tokens)
title_tokens_no_stop[0][0:5]

['online', 'linear', 'regression', 'application', 'model-based']

**Now we combine all the abstarct tokens, title tokens and authors into three lists respectively**

In [65]:
combined_abstract_tokens = list(itertools.chain.from_iterable(abstract_tokens_no_stop))
combined_abstract_tokens[0:4]

['provide', 'provably', 'eﬃcient', 'algorithm']

In [66]:
combined_title_tokens = list(itertools.chain.from_iterable(title_tokens_no_stop))
combined_title_tokens[0:4]

['online', 'linear', 'regression', 'application']

In [67]:
combined_authors = list(itertools.chain.from_iterable(all_authors_clean))
combined_authors[0:3]

['Michael L. Littman', 'Alexander L. Strehl', 'Yoshua Bengio']

##### Final Tasks: <br>
* Creating a freq distribution disctionaries for abstarct tokens, title tokens and author's, using **FreqDist()** method
* Extracting all the items from the respective dictionaries and converting them into a list, using **.items()**
* sort the list using **sorted()** method.
* first decreasing sort for frequencies (-x[1]) and then increasing sort for tokens (x[0])
* selecting top 10 from the sorted list by slicing through the list ([0:10])

In [68]:
combined_abstract_tokens_sorted = [x for x,y in sorted(list(FreqDist(combined_abstract_tokens).items()),key=lambda x: (-x[1],x[0]),reverse=False)[0:10]]
combined_abstract_tokens_sorted

['learning',
 'data',
 'model',
 'algorithm',
 'problem',
 'show',
 'method',
 'based',
 'methods',
 'problems']

In [69]:
combined_title_tokens_sorted = [x for x,y in sorted(list(FreqDist(combined_title_tokens).items()),key=lambda x: (-x[1],x[0]),reverse=False)[0:10]]
combined_title_tokens_sorted

['learning',
 'models',
 'networks',
 'deep',
 'model',
 'data',
 'estimation',
 'regression',
 'sparse',
 'visual']

In [70]:
combined_authors_sorted = [x for x,y in sorted(list(FreqDist(combined_authors).items()),key=lambda x: (-x[1],x[0]),reverse=False)[0:10]]
combined_authors_sorted

['Ambuj Tewari',
 'Tong Zhang',
 'Trevor Darrell',
 'Alex J. Smola',
 'Bertrand Thirion',
 'Pradeep K. Ravikumar',
 'Remi Munos',
 'Richard Zemel',
 'Tamir Hazan',
 'Alessandro Lazaric']

#### Creating a dataframe <br>
##### Tasks :
* Create an empty data frame with three columns['top10_terms_in_abstracts','top10_terms_in_titles','top10_authors']
* Insert the top 10 values from the lists into each column according to the label respectively

In [71]:
df = pd.DataFrame(columns=['top10_terms_in_abstracts','top10_terms_in_titles','top10_authors'])

In [72]:
df['top10_terms_in_abstracts']=combined_abstract_tokens_sorted

In [73]:
df['top10_terms_in_titles']=combined_title_tokens_sorted

In [74]:
df['top10_authors']=combined_authors_sorted

In [75]:
df # data frame

Unnamed: 0,top10_terms_in_abstracts,top10_terms_in_titles,top10_authors
0,learning,learning,Ambuj Tewari
1,data,models,Tong Zhang
2,model,networks,Trevor Darrell
3,algorithm,deep,Alex J. Smola
4,problem,model,Bertrand Thirion
5,show,data,Pradeep K. Ravikumar
6,method,estimation,Remi Munos
7,based,regression,Richard Zemel
8,methods,sparse,Tamir Hazan
9,problems,visual,Alessandro Lazaric


Writing the dataframe into a csv file using **dataframe.to_csv()**

In [82]:
df.to_csv('stats.csv',index=None,header=True)  # generating a CSV file from a data frame using .to_csv()

## References <br>
* Pandas 0.25.1 documentation https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
* re - Regular Expression Opeartions https://docs.python.org/2/library/re.html
* re - Regular Expression Operations https://docs.python.org/3/library/re.html
