In [1]:
import requests

In [2]:
req = requests.get('http://proceedings.mlr.press/v70/')

In [3]:
req.status_code

200

In [4]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(req.content, 'html.parser')
soup.prettify()[:100]

'<!DOCTYPE html>\n<html lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-eq'

In [5]:
s = soup.main
div_wrapper = s.find('div', class_ = "wrapper")

In [6]:
# for each div corresponding to a paper
pdf_links = []
for div_paper in div_wrapper.find_all('div', class_ = 'paper'):
    links = div_paper.find('p', class_ = "links")
    # Get the pdf
    a = links.find_all('a')
    # Save the links ending in .pdf
    for content in a:
        # Get the link
        href = content.get('href')
        # Check if it is a pdf
        if href.find('.pdf') != -1:
            pdf_links.append(href)
    

In [7]:
pdf_links[:10]

['http://proceedings.mlr.press/v70/achab17a/achab17a.pdf',
 'http://proceedings.mlr.press/v70/achab17a/achab17a-supp.pdf',
 'http://proceedings.mlr.press/v70/acharya17a/acharya17a.pdf',
 'http://proceedings.mlr.press/v70/acharya17a/acharya17a-supp.pdf',
 'http://proceedings.mlr.press/v70/achiam17a/achiam17a.pdf',
 'http://proceedings.mlr.press/v70/achiam17a/achiam17a-supp.pdf',
 'http://proceedings.mlr.press/v70/agarwal17a/agarwal17a.pdf',
 'http://proceedings.mlr.press/v70/agarwal17a/agarwal17a-supp.pdf',
 'http://proceedings.mlr.press/v70/akrour17a/akrour17a.pdf',
 'http://proceedings.mlr.press/v70/aksoylar17a/aksoylar17a.pdf']

These are the scraped pdf links from http://proceedings.mlr.press/v70/.

There are about 722 of them

In [8]:
import PyPDF2 as ppdf
from io import BytesIO

Algorithm to extract text from a pdf link
```
from io import BytesIO
req = requests.get(pdf_links[0])
with BytesIO(req.content) as data:
    read_pdf = ppdf.PdfFileReader(data)
    for page in range(read_pdf.getNumPages()):
        read_pdf.getPage(page).extractText()
```

Now do this in a for each pdf link

In [9]:
giant_string = ""
count = 0
pdfs = []
for pdf_link in pdf_links:
    with requests.get(pdf_link) as req:
        with BytesIO(req.content) as data:
            try:
                read_pdf = ppdf.PdfFileReader(data)
                for page in range(read_pdf.getNumPages()):
                    giant_string += " " + read_pdf.getPage(page).extractText()
                pdfs.append(read_pdf)
            except:
                print(data)
    count += 1
    if count == 10:
        break

Some links return a 404 error thus creating an exception when trying to read a pdf.

In [10]:
from collections import Counter
giant_string = giant_string.replace("\n", " ")
split_it = giant_string.split()

counter = Counter(split_it)

top_10_words = counter.most_common(10)
top_10_words

[('the', 2928),
 ('of', 1699),
 ('and', 941),
 ('to', 823),
 ('a', 784),
 ('is', 671),
 ('in', 645),
 ('for', 638),
 ('that', 582),
 ('we', 428)]

These are the 10 most frequently used words from the scraped pdfs.

counter is a dictionary-like object mapping words to their frequencies, which we will use later to estimate pmfs.

In [11]:
# Make a Dataframe from the words
counts = {}
in_order =  counter.most_common(len(counter))
total_words = 0
for pair in in_order:
    counts[pair[0]] = [pair[1]]
    total_words += pair[1]

In [12]:
import pandas as pd
df = pd.DataFrame(counts)
df.head()

Unnamed: 0,the,of,and,to,a,is,in,for,that,we,...,"2013-ST-061-ED0001,",contract,N00014-13-C-0288.,views,clusions,document,interpreted,"implied,","NSF,",AF.
0,2928,1699,941,823,784,671,645,638,582,428,...,1,1,1,1,1,1,1,1,1,1


A dataframe representation of the words with their number of appearances

In [29]:
# Remove every word that has only one occurrence
rem = []
for word in df:
    if df[word][0] == 1 or "," in word or ")" in word or "(" in word:
        rem.append(word)
df = df.drop(labels = [word for word in rem],axis = 1)

In lecture, prof. said to remove all words with frequency one, likely getting rid of words that arent actually words (equations).

In [30]:
for word in df:
    num = df[word][0]  #DF[COLUMN][ROW]
    df[word] = num / (total_words)
word_probs = df.iloc[0].to_numpy() #an array of probabilities
df.head()

Unnamed: 0,the,of,and,to,a,is,in,for,that,we,...,plot,November,spatial,Scan,statistic.,Nisheeth,CCF:,Grant,U.S.,ONR
0,1.873802e-11,1.087291e-11,6.02202e-12,5.266868e-12,5.017283e-12,4.294129e-12,4.12774e-12,4.082942e-12,3.724565e-12,2.739027e-12,...,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14,1.279919e-14


We have now mofied the table to hold the probability of seeing a particular word over the whole set.

I.e. marginal distribution

In [31]:
import numpy as np
import scipy as sci
from scipy.stats import entropy

In [32]:
 # Create a distribution for choosing a random word from a random paper, i.e. choose a random paper then a random word
Z = []
for pdf in pdfs:
    curr_paper = ""
    for page in range(pdf.getNumPages()):
        curr_paper += " " + pdf.getPage(page).extractText()
    curr_paper = curr_paper.replace("\n", " ").split()
    word_count = Counter(split_it)
    for word in word_count:
        word_count[word] /= (len(curr_paper) * len(pdfs))
        Z.append(word_count[word])
entropy(Z, base = 2)

13.991839289977646

This is the entropy for the random variable Z; This conveys a great deal up surprise

In [33]:
df.columns

Index(['the', 'of', 'and', 'to', 'a', 'is', 'in', 'for', 'that', 'we',
       ...
       'plot', 'November', 'spatial', 'Scan', 'statistic.', 'Nisheeth', 'CCF:',
       'Grant', 'U.S.', 'ONR'],
      dtype='object', length=3968)

In [34]:
word_probs

array([1.87380177e-11, 1.08729140e-11, 6.02202004e-12, ...,
       1.27991924e-14, 1.27991924e-14, 1.27991924e-14])

In [35]:
import random
words = df.columns
random_words_paragraph = random.choices(words, weights = word_probs, k = 50)

In [36]:
for word in random_words_paragraph:
    print(word, end = " ")

R. discarded shows where many node seen in search and Deﬁne exponential a and this FTPL the is of The to but to iterateπ points slowly of we network seen to in Binomial that balanced the partially for the this constraints. 2R2d2 these size more for of can +ij;T;H sizes 

This is pretty neat; answer for problem 2.3