In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import glob
import re
import os
from io import StringIO
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
import PyPDF2
def get_pdf_file_content(path_to_pdf):
    '''
    Extract the text of every page of a PDF file using pdfminer.

    path_to_pdf: path of the PDF file whose content we want to extract.

    Returns the text of all processed pages as one string.

    Any exception raised while opening or parsing the file propagates to the
    caller. The file handle, the text converter and the in-memory buffer are
    closed in every case, including when an exception is raised.
    '''
    # PDFResourceManager stores shared resources such as fonts or images that
    # we might encounter in the files.
    resource_manager = PDFResourceManager(caching=True)
    # In-memory buffer that receives the text representation of the PDF.
    out_text = StringIO()
    # TextConverter writes the interpreted pages into out_text, using the
    # default layout parameters (LAParams()).
    text_converter = TextConverter(resource_manager, out_text, laparams=LAParams())
    try:
        # The interpreter renders each page through the converter.
        interpreter = PDFPageInterpreter(resource_manager, text_converter)
        # `with` guarantees the file is closed even if a page fails to parse.
        with open(path_to_pdf, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=10000000, password="", caching=True, check_extractable=False):
                interpreter.process_page(page)
        # getvalue() must be called before the StringIO object is closed;
        # the return expression is evaluated before the finally clause runs.
        return out_text.getvalue()
    finally:
        # Release the converter and the buffer on success and on failure.
        text_converter.close()
        out_text.close()

# Pull the raw text out of a single whitepaper.
path_to_pdf = "/Users/alisdghnia/Desktop/Helium.pdf"
Al = get_pdf_file_content(path_to_pdf)

# Substitutions applied to the extracted text, strictly in the order listed.
# Presumably these undo mis-extracted accented letters (TODO confirm against
# the PDF); the last pair turns every backslash into a double quote.
for broken, fixed in (
    ("¶a", "á"),
    ("¶e", "é"),
    ("¶‡", "í"),
    ("¶o", "ó"),
    ("¶u", "ú"),
    ("~n", "ñ"),
    ("¶A", "Á"),
    ("\\", '"'),
):
    Al = Al.replace(broken, fixed)

# Whitespace-delimited tokens of the cleaned text.
words = Al.split()

In [12]:
# Make the whitepaper folder the working directory so that relative file
# names resolve against it.
os.chdir('/Users/alisdghnia/Desktop/PDF Whitepapers Copy/')

In [13]:
# Document names: every '*.pdf' file found while walking the whitepaper
# folder, reduced to the part of the file name before the first '.pdf'.
names = [
    pdf_file.split('.pdf')[0]
    for (_folder, _subfolders, entries) in os.walk('/Users/alisdghnia/Desktop/PDF Whitepapers Copy/')
    for pdf_file in entries
    if pdf_file.endswith('.pdf')
]

names

['WeTrust1',
 'Libra4',
 'B2BX1',
 'Bytom1',
 'Cindicator1',
 'iXledger1',
 'FARAD1',
 'DomRaider1',
 'Zeusshield1',
 'Steem1',
 'Blocknet1',
 'Chronobank1',
 'iExec1',
 'Syscoin1',
 'Gulden1',
 'Winding-Tree2',
 'maymounkov-kademlia-lncs',
 'Cardano1',
 'CoinDash1',
 'Pundi-X1',
 'vSlice1',
 'ODYSSEY1',
 'Iconomi1',
 'Aeron1',
 'Metal1',
 'CoinStarter1',
 'Bancor1',
 'openANX1',
 'Zilliqa2',
 'Rebellious1',
 'Minexcoin1',
 'Numeraire1',
 'Qbao1',
 'Cappasity1',
 'Privatix1',
 'Streamr1',
 'AirSwap1',
 'Avalanche1',
 'ICOS1',
 'RIALTO.AI1',
 'CyberVein1',
 'Enigma1',
 'WaBi1',
 'Cardano3',
 'Winding-Tree1',
 'Synereo1',
 'GATCOIN1',
 'Monaco1',
 'Cardano2',
 'HOQU1',
 'Nebulas1',
 'Oyster1',
 'DigiPulse1',
 'openANX2',
 'Zilliqa1',
 'NAGA1',
 'Earth-Token1',
 'Curriculum-Vitae1',
 'Kin1',
 'Lamden1',
 'Libra2',
 'Blockport1',
 'Ethereum-Classic1',
 'Coinlancer1',
 'Bounty0x1',
 'Genesis-Vision1',
 'MobileGo1',
 'FundYourselfNow1',
 'Opus1',
 'DigitalNote1',
 'CREDITS2',
 'Komodo1',
 '1

In [14]:
# Build {document name: lower-cased text} for every PDF under the whitepaper folder.
corpus = {}
failed = []  # names of the PDFs whose text could not be extracted
for (dirname, dirs, files) in os.walk('/Users/alisdghnia/Desktop/PDF Whitepapers Copy/'):
    for filename in files:
        if not filename.endswith('.pdf'):
            continue
        name = filename.split('.pdf')[0]
        print(name)
        try:
            # Join with dirname so the file is found whatever the current
            # working directory is, including PDFs inside nested folders.
            text = get_pdf_file_content(os.path.join(dirname, filename))
        except Exception as exc:
            # Skip this document. Falling through here would store the text
            # of the previously processed PDF under this name (or raise
            # NameError if the very first file fails).
            print('  could not extract text from {}: {}'.format(filename, exc))
            failed.append(name)
            continue

        corpus[name] = text.lower()

# NOTE: every value in corpus is a single string, so pandas broadcasts it down
# the index: df has one column per document and len(names) identical rows.
df = pd.DataFrame(corpus, index = names)
df

WeTrust1
Libra4
B2BX1
Bytom1
Cindicator1
iXledger1
FARAD1
DomRaider1
Zeusshield1
Steem1
Blocknet1
Chronobank1
iExec1
Syscoin1
Gulden1
Winding-Tree2
maymounkov-kademlia-lncs
Cardano1
CoinDash1
Pundi-X1
vSlice1
ODYSSEY1
Iconomi1
Aeron1
Metal1
CoinStarter1
Bancor1
openANX1
Zilliqa2
Rebellious1
Minexcoin1
Numeraire1
Qbao1
Cappasity1
Privatix1
Streamr1
AirSwap1
Avalanche1
ICOS1
RIALTO.AI1
CyberVein1
Enigma1
WaBi1
Cardano3
Winding-Tree1
Synereo1
GATCOIN1
Monaco1
Cardano2
HOQU1
Nebulas1
Oyster1
DigiPulse1
openANX2
Zilliqa1
NAGA1
Earth-Token1
Curriculum-Vitae1
Kin1
Lamden1
Libra2
Blockport1
Ethereum-Classic1
Coinlancer1
Bounty0x1
Genesis-Vision1
MobileGo1
FundYourselfNow1
Opus1
DigitalNote1
CREDITS2
Komodo1
199
COSS1
PayPie1
TrueChain1
Reddcoin1
Dragonchain1
Omni2
VeChain2
VeChain3
Omni3
rfc6330
aelf1
Melon1
Karma1
XPA1
Verge1
SelfKey1
DTR1
EchoLink1
NavCoin1
Libra3
Libra1
Aventus1
Blackmoon1
Verify1
MobileGo2
Civic1
Request-Network1
Experience-Points1
CREDITS1
secure-names-bit-strings
Karma3


Unnamed: 0,WeTrust1,Libra4,B2BX1,Bytom1,Cindicator1,iXledger1,FARAD1,DomRaider1,Zeusshield1,Steem1,...,Sky4,Auctus1,Monetha1,IoT-Chain1,Status1,CPChain1,Presearch2,Electra1,Blockmason1,Nucleus-Vision1
WeTrust1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Libra4,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
B2BX1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Bytom1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Cindicator1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CPChain1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Presearch2,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Electra1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...
Blockmason1,wetrust whitepaper table of contents \n\n \n\n...,state machine replication in the libra\n\nbloc...,23\n\n,bytomv1.0 ...,hybrid intelligence for\n\neective asset mana...,\n\nan​ ​insurance​ ​marketplace​ ​based​ ​on...,farad:\n\ncommoditising forward purchase contr...,2\n\nadvance warning\n\nthe initial coin offe...,zeusshield system\n\nzeusshield smart insuranc...,"\n \n\n \n\nsteem \n\nan​ ​incentivized,​ ​b...",...,2016 ieee trustcom/bigdatase/ispa\n2016 ieee t...,\n\n \n \n \n\n \n\n \n\n \n\n \n \n \n \n \...,\n\n \n\n \n\n \n\n \n\nwhite​ ​paper \n\n \n...,\n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \...,the status network\na strategy towards mass ad...,\t\n\ncyber-physical chain (cpchain) whitepape...,whitepaper\n\nthe community-powered search eng...,"white paper\n\nrelease 1.0 / january 31, 2018\...",the credit protocol whitepaper v1.0.1\n\nblock...,“the full potential of connected devices is on...


In [26]:
# Take the first row of df (a Series indexed by document name) and turn it
# into a one-column frame; the column keeps the row's label as its name.
df1 = df.iloc[0].to_frame()

In [27]:
# Display the one-column frame: one row per document name, holding that
# document's extracted text.
df1

Unnamed: 0,WeTrust1
WeTrust1,wetrust whitepaper table of contents \n\n \n\n...
Libra4,state machine replication in the libra\n\nbloc...
B2BX1,23\n\n
Bytom1,bytomv1.0 ...
Cindicator1,hybrid intelligence for\n\neective asset mana...
...,...
CPChain1,\t\n\ncyber-physical chain (cpchain) whitepape...
Presearch2,whitepaper\n\nthe community-powered search eng...
Electra1,"white paper\n\nrelease 1.0 / january 31, 2018\..."
Blockmason1,the credit protocol whitepaper v1.0.1\n\nblock...


In [28]:
# DataFrame.rename returns a new frame and leaves the original untouched, so
# the result has to be assigned back for the new header to stick (and to reach
# the CSV export). df1 has exactly one column, so it is renamed by position
# rather than by the hard-coded 'WeTrust1' label, which depends on the order
# in which os.walk happened to list the files.
df1 = df1.rename(columns={df1.columns[0]: 'Text Corpus'})
df1

Unnamed: 0,Text Corpus
WeTrust1,wetrust whitepaper table of contents \n\n \n\n...
Libra4,state machine replication in the libra\n\nbloc...
B2BX1,23\n\n
Bytom1,bytomv1.0 ...
Cindicator1,hybrid intelligence for\n\neective asset mana...
...,...
CPChain1,\t\n\ncyber-physical chain (cpchain) whitepape...
Presearch2,whitepaper\n\nthe community-powered search eng...
Electra1,"white paper\n\nrelease 1.0 / january 31, 2018\..."
Blockmason1,the credit protocol whitepaper v1.0.1\n\nblock...


In [29]:
# Switch to the Desktop so that files written with a relative path land there.
os.chdir('/Users/alisdghnia/Desktop/')

In [31]:
# Write df1 (index included) to a CSV file; the path is relative, so the file
# is created in the current working directory.
df1.to_csv('Files Texts Dataframe.csv')