In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import glob
import re
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import TextConverter
from io import StringIO
from pdfminer.pdfpage import PDFPage
def get_pdf_file_content(path_to_pdf):
    '''
    path_to_pdf: is the parameter that will give access to the PDF File 
    we want to extract the content.
    '''
    '''
    PDFResourceManager is used to store shared resources such as fonts or images that 
    we might encounter in the files. 
    '''
    resource_manager = PDFResourceManager(caching=True)
    '''
    create a string object that will contain the final text the representation of the pdf. 
    '''
    out_text = StringIO()
    '''
    UTF-8 is one of the most commonly used encodings, and Python often defaults to using it.
    In our case, we are going to specify in order to avoid some encoding errors.
    '''
    codec = 'utf-8'
    """
    LAParams is the object containing the Layout parameters with a certain default value. 
    """
    laParams = LAParams()
    '''
    Create a TextConverter Object, taking :
    - ressource_manager,
    - out_text 
    - layout parameters.
    '''
    text_converter = TextConverter(resource_manager, out_text, laparams=laParams)
    fp = open(path_to_pdf, 'rb')
    
    '''
    Create a PDF interpreter object taking: 
    - ressource_manager 
    - text_converter
    '''
    interpreter = PDFPageInterpreter(resource_manager, text_converter)
    '''
    We are going to process the content of each page of the original PDF File
    '''
    for page in PDFPage.get_pages(fp, pagenos=set(), maxpages=10000000, password="", caching=True, check_extractable=False):
        interpreter.process_page(page)
    '''
    Retrieve the entire contents of the “file” at any time 
    before the StringIO object’s close() method is called.
    '''
    text = out_text.getvalue()
    '''
    Closing all the ressources we previously opened
    '''
    fp.close()
    text_converter.close()
    out_text.close()
    '''
    Return the final variable containing all the text of the PDF
    '''
    return text

path_to_pdf = "/Users/alisdghnia/Desktop/Helium.pdf"
Al=get_pdf_file_content(path_to_pdf)
#print(Al.count("¶"))
#changer = Al.find("¶")
Al = Al.replace("¶a","á")
Al = Al.replace("¶e","é")
Al = Al.replace("¶‡","í")
Al = Al.replace("¶o","ó")
Al = Al.replace("¶u","ú")
Al = Al.replace("~n","ñ")
Al = Al.replace("¶A","Á")
Al = Al.replace("\\",'"')
#print(changer)
#print(Al)
##print(Al.count('\\'))
##print(Al)

words = Al.split()

In [4]:
def clean_text(text):
    # remove numbers
    text_nonum = re.sub(r'\d+', '', text)
    # remove punctuations and convert characters to lower case
    text_nopunct = "".join([char.lower() for char in text_nonum if char not in string.punctuation]) 
    # substitute multiple whitespace with single whitespace
    # Also, removes leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_no_doublespace

def remove_stops(text, stops):
    text = re.sub(r"AC\/\d{1,4}\/\d{1,4}", "", text)
    words = text.split()
    final = []
    for word in words:
        if word not in stops:
            final.append(word)
    final = " ".join(final)
    final = final.translate(str.maketrans("", "", string.punctuation))
    final = "".join([i for i in final if not i.isdigit()])
    while "  " in final:
        final = final.replace("  ", " ")
    return (final)

In [30]:
Al = remove_stops(Al, stopwords.words('english'))

In [9]:
tfidf_vectorizer = TfidfVectorizer(input = '/Users/alisdghnia/Desktop/Helium.pdf', 
                                   stop_words= 'english',lowercase=True,
                                    max_df=0.8,
                                    min_df=5)

In [10]:
cv = CountVectorizer(input = '/Users/alisdghnia/Desktop/Helium.pdf', 
                                   stop_words= 'english',lowercase=True,
                                    max_df=0.8,
                                    min_df=5)

In [None]:
for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.pdf'):
            try:
                path_to_pdf = filename
                text = get_pdf_file_content(path_to_pdf)
            except Exception:
                pass
            text = clean_text(text)
            text = remove_stops(text,stopwords.words('english'))
            words = text.split()

In [31]:
tfidf_vectors = tfidf_vectorizer.fit_transform(words)
feature_names = tfidf_vectorizer.get_feature_names()
tfidf_transformer = TfidfTransformer(smooth_idf= True, use_idf=True)
tfidf_transformer.fit(tfidf_vectors)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index = feature_names, columns = ['idf_weights'])
df_idf.sort_values(by=['idf_weights'])



Unnamed: 0,idf_weights
network,5.338182
helium,5.688384
proof,5.817318
miners,5.944317
blockchain,6.164860
...,...
participants,8.716906
participate,8.716906
particular,8.716906
modulation,8.716906


In [24]:
count_vector = cv.fit_transform(words)
tf_idf_vector = tfidf_transformer.transform(count_vector)
feature_names_cv = tfidf_vectorizer.get_feature_names()
first_document_vector = tf_idf_vector[0]

df = pd.DataFrame(first_document_vector.T.todense(), index = feature_names_cv, columns = ['tfidf'])
df.sort_values(by=['tfidf'], ascending=False)



Unnamed: 0,tfidf
helium,1.0
10,0.0
protocol,0.0
quickly,0.0
purpose,0.0
...,...
figure,0.0
fees,0.0
fee,0.0
fault,0.0


In [38]:
os.chdir('/Users/alisdghnia/Desktop/')
df.to_csv('Fuck Me pt 3.csv')

In [39]:
os.chdir('/Users/alisdghnia/Desktop/PDF Whitepapers Copy/')

In [None]:
for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.pdf'):
            print(filename)
            path_to_pdf = filename
            text = get_pdf_file_content(path_to_pdf)
            text = clean_text(text)
            text = remove_stops(text,stopwords.words('english'))
            words = text.split()
            try:
                tfidf_vectors = tfidf_vectorizer.fit_transform(words)
            except Exception:
                pass
            feature_names = tfidf_vectorizer.get_feature_names()
            tfidf_transformer = TfidfTransformer(smooth_idf= True, use_idf=True)
            tfidf_transformer.fit(tfidf_vectors)
            try:
                count_vector = cv.fit_transform(words)
            except Exception:
                pass
            tf_idf_vector = tfidf_transformer.transform(count_vector)
            feature_names_cv = tfidf_vectorizer.get_feature_names()
            first_document_vector = tf_idf_vector[0]
                
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = feature_names, columns = ['idf_weights'])
            df_idf.sort_values(by=['idf_weights'])

            df = pd.DataFrame(first_document_vector.T.todense(), index = feature_names_cv, columns = ['tfidf'])
            df.sort_values(by=['tfidf'], ascending=False)

In [None]:
df

Unnamed: 0,tfidf
ability,0.0
able,0.0
access,0.0
account,0.0
accounts,0.0
...,...
whatsoever,0.0
white,0.0
working,0.0
world,0.0


In [47]:
files = [get_pdf_file_content(file) for file in os.listdir('/Users/alisdghnia/Desktop/untitled folder/') if file.endswith('.pdf')] 
tfidf_vectorizer = TfidfVectorizer(use_idf = True) 
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(files) 
df = pd.DataFrame(tfidf_vectorizer_vectors.T.todense(),
                  index=tfidf_vectorizer.get_feature_names(), 
                  columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

FileNotFoundError: [Errno 2] No such file or directory: 'Creativecoin1 copy.pdf'

In [None]:
for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.pdf'):
            print(filename)
            path_to_pdf = filename
            text = get_pdf_file_content(path_to_pdf)
            text = clean_text(text)
            text = remove_stops(text,stopwords.words('english'))
            words = text.split()
            try:
                tfidf_vectors = tfidf_vectorizer.fit_transform(words)
            except Exception:
                pass
            feature_names = tfidf_vectorizer.get_feature_names()
            tfidf_transformer = TfidfTransformer(smooth_idf= True, use_idf=True)
            tfidf_transformer.fit(tfidf_vectors)
            try:
                count_vector = cv.fit_transform(words)
            except Exception:
                pass
            tf_idf_vector = tfidf_transformer.transform(count_vector)
            feature_names_cv = tfidf_vectorizer.get_feature_names()
            first_document_vector = tf_idf_vector[0]
                
            df_idf = pd.DataFrame(tfidf_transformer.idf_, index = feature_names, columns = ['idf_weights'])
            df_idf.sort_values(by=['idf_weights'])

            df = pd.DataFrame(first_document_vector.T.todense(), index = feature_names_cv, columns = ['tfidf'])
            df.sort_values(by=['tfidf'], ascending=False)

In [53]:
os.chdir('/Users/alisdghnia/Desktop/untitled folder/')
for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.pdf'):
            file = get_pdf_file_content(filename)
            file = clean_text(file)
            file = remove_stops(file, stopwords.words('english'))
            word = file.split()
            tfidf_vectorizer = TfidfVectorizer(use_idf = True) 
            tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(word)
            df = pd.DataFrame(tfidf_vectorizer_vectors.T.todense(),
                            index=tfidf_vectorizer.get_feature_names(), 
                            columns=["tfidf"])
            df.sort_values(by=["tfidf"],ascending=False)



ValueError: Shape of passed values is (1099, 2580), indices imply (1099, 1)

In [11]:
os.chdir('/Users/alisdghnia/Desktop/untitled folder/')
for (dirname, dirs, files) in os.walk('.'):
    for filename in files:
        if filename.endswith('.pdf'):
            try:
                file = get_pdf_file_content(filename)
                file = clean_text(file)
                file = remove_stops(file, stopwords.words('english'))
                word = file.split()
                tfidf_vectors = tfidf_vectorizer.fit_transform(word)
                feature_names = tfidf_vectorizer.get_feature_names()
                tfidf_transformer = TfidfTransformer(smooth_idf= True, use_idf=True)
                tfidf_transformer.fit(tfidf_vectors)
                df_idf = pd.DataFrame(tfidf_transformer.idf_, index = feature_names, columns = ['idf_weights'])
                df_idf.sort_values(by=['idf_weights'])
            except Exception:
                pass



In [12]:
df_idf.sort_values(by= 'idf_weights', ascending = False)

Unnamed: 0,idf_weights
goals,7.066495
tolerance,7.066495
trivial,7.066495
larger,7.066495
messages,7.066495
...,...
transactions,5.331894
unl,5.331894
network,4.907010
nodes,4.763910


In [13]:
df = df_idf.reset_index(inplace=True)
#df = df_idf.rename(columns = {'index':'Word'})

In [16]:
df = df_idf

In [21]:
df = df.rename(columns={'index': 'Words'})

In [22]:
df

Unnamed: 0,Words,idf_weights
0,able,6.912344
1,achieve,6.778813
2,agreement,5.526050
3,algorithm,5.862522
4,algorithms,6.555669
...,...,...
89,users,6.661030
90,utility,6.373348
91,values,6.661030
92,votes,6.778813


In [7]:
os.chdir('/Users/alisdghnia/Desktop/untitled folder/')

In [24]:
df.sort_values(by= 'idf_weights', ascending = False).to_csv('Top 10 Cryptocurrency IDF Words with Weights.csv')