# Import Dependencies

In [None]:
from nltk.stem.snowball import SnowballStemmer
import pandas as pd
import PyPDF2 as pdf
import re
import spacy

# Initializing file variables

In [36]:
# Paths of the two sample PDF documents processed in the cells below.
file_1 = 'Solverminds_Data/Sample/001.pdf'
file_2 = 'Solverminds_Data/Sample/002.pdf'

# Processing PDF files

PyPDF2 is a pure-Python PDF library capable of splitting, merging together, cropping, and transforming the pages of PDF files. It can also add custom data, viewing options, and passwords to PDF files. It can retrieve text and metadata from PDFs as well as merge entire files together. Here we use PyPDF2 to handle data from PDF files.
Next we use Python's 're' (regular expression) module to clean the text obtained from the PDF files. In the function below, regular expressions are used to remove escape sequences, extra white space and punctuation. We use the simple replace method of the string object to remove newlines from the text.
Finally the function returns cleaned text.

In [34]:
def processPdf(file):
    """Extract the text of every page of a PDF and return it cleaned.

    The text of all pages is joined, every character that is neither a word
    character nor whitespace is replaced by a space, newlines are replaced by
    spaces, runs of spaces are collapsed to one, and leading/trailing
    whitespace is stripped.

    Args:
        file: Path of the PDF file to read.

    Returns:
        The cleaned text of the whole document as a single string; an empty
        string when the PDF has no pages.
    """
    # The context manager closes the file handle even if extraction fails.
    # Page text must be extracted while the file is still open.
    with open(file, 'rb') as pdfFile:
        filereader = pdf.PdfFileReader(pdfFile)
        pages = [
            filereader.getPage(index).extractText()
            for index in range(filereader.getNumPages())
        ]

    # Keep every page (not just the last one); the separating space stops the
    # last word of one page from fusing with the first word of the next.
    text = ' '.join(pages)

    # Punctuation removal
    processed = re.sub(r'[^\w\s]', ' ', text)
    processed = processed.replace('\n', ' ')
    # Collapse the runs of spaces left behind by the two substitutions above.
    processed = re.sub(' +', ' ', processed)
    return processed.strip()

# Creating instances of spaCy and NLTK models

In [45]:
# Load the spaCy pipeline named 'en_core_web_sm' and create an English Snowball stemmer.
nlp = spacy.load('en_core_web_sm')
stemmer = SnowballStemmer(language='english')
# Run the spaCy pipeline over the cleaned text returned by processPdf for file_1.
doc = nlp(processPdf(file_1))

# Tokenization
The next step in our pipeline is to break this sentence into separate words or tokens. This is called tokenization. Tokenization is easy to do in English. We’ll just split apart words whenever there’s a space between them. And we’ll also treat punctuation marks as separate tokens since punctuation also has meaning.

In [46]:
# Collect the text of every token in the parsed document.
word_tokens = [token.text for token in doc]

# Stopwords Removal
Next, we want to consider the importance of each word in the sentence. English has a lot of filler words that appear very frequently like “and”, “the”, and “a”. When doing statistics on text, these words introduce a lot of noise since they appear way more frequently than other words. Some NLP pipelines will flag them as stop words — that is, words that we might want to filter out before doing any statistical analysis.

In [None]:
# Stopword removal: keep the text of every token that spaCy does not flag
# as a stop word (token.is_stop).
word_tokens_stop = [token.text for token in doc if not token.is_stop]

# Stemming and Lemmatization
For grammatical reasons, documents are going to use different forms of a word, such as organize, organizes, and organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these words to return documents that contain another word in the set.

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.

In [48]:
# Stemming: pass the text of every non-stop-word token through the Snowball
# stemmer instance `stemmer`.
word_tokens_stem = [stemmer.stem(token.text) for token in doc if not token.is_stop]

In [49]:
# Lemmatization: take spaCy's lemma (token.lemma_) of every token that is
# not flagged as a stop word.
word_tokens_lemma = [token.lemma_ for token in doc if not token.is_stop]

In [50]:
# Emit two newline characters: the '\n' argument plus print's own line ending.
print('\n')





# TF-IDF Vectorization
tf-idf stands for Term frequency-inverse document frequency. The tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus. The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus (data-set).

tf-idf is a weighting scheme that assigns each term in a document a weight based on its term frequency (tf) and inverse document frequency (idf). The terms with higher weight scores are considered to be more important.

Typically, the tf-idf weight is composed of two terms:

Normalized Term Frequency (tf)
Inverse Document Frequency (idf)

###### Term Frequency (tf):
Gives us the frequency of the word in each document in the corpus. It is the ratio of number of times the word appears in a document compared to the total number of words in that document. It increases as the number of occurrences of that word within the document increases. Each document has its own tf.
$$tf_{i,j}=\frac{n_{i,j}}{\sum_k n_{k,j}}$$

###### Inverse Document Frequency (idf):
Used to calculate the weight of rare words across all documents in the corpus. The words that occur rarely in the corpus have a high IDF score. It is given by the equation below.
$$idf_i = \log\left (\frac{N}{df_i}  \right )$$


Combining these two we come up with the TF-IDF score (w) for a word in a document in the corpus. It is the product of tf and idf:
$$w_{i,j} = tf_{i,j} \times \log\left (\frac{N}{df_i}  \right )$$

$$tf_{i,j} = \textrm{term frequency of i in j (occurrences of i in j divided by the total number of terms in j)}$$
$$df_i = \textrm{number of documents containing i}$$
$$N = \textrm{Total number of documents}$$

In [38]:
# Cleaned text of each document.
docA = processPdf(file_1)
docB = processPdf(file_2)
# NOTE(review): this immediately replaces the text extracted from file_2 with
# a fixed sample sentence, so file_2 does not contribute to anything computed
# below - confirm whether the override is intentional.
docB = "the dog sat on my bed"

# Bag of words per document: the cleaned text split on single spaces.
bowA, bowB = docA.split(" "), docB.split(" ")

# Vocabulary: every distinct word seen in either document.
wordSet = set(bowA) | set(bowB)

# One zero-initialised count table per document, keyed by the full vocabulary
# so both tables share exactly the same keys.
wordDictA = {term: 0 for term in wordSet}
wordDictB = {term: 0 for term in wordSet}

# Tally how often each word occurs in its own document.
for word in bowA:
    wordDictA[word] = wordDictA[word] + 1

for word in bowB:
    wordDictB[word] = wordDictB[word] + 1

The function 'computeTF' computes the TF score for each word in the corpus, by document.

In [39]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bow: The document's bag of words (sequence of its tokens); only its
            length is used, as the total number of terms.

    Returns:
        A dict mapping each key of ``wordDict`` to its count divided by
        ``len(bow)``. When ``bow`` is empty every frequency is ``0.0``.
    """
    bowCount = len(bow)
    if not bowCount:
        # An empty document contains no terms; report zero frequencies
        # instead of dividing by zero.
        return dict.fromkeys(wordDict, 0.0)

    # float() keeps this a true division regardless of interpreter version.
    total = float(bowCount)
    return {word: count / total for word, count in wordDict.items()}

In [40]:
# Term frequency of every vocabulary word within document A and document B.
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

The function 'computeIDF' computes the IDF score of every word in the corpus.

In [29]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> number of occurrences in that document.

    Returns:
        A dict mapping every word that appears as a key in any document to
        ``log(N / df)``, where ``N`` is the number of documents and ``df`` is
        the number of documents in which the word's count is greater than
        zero. A word that occurs in no document gets ``0.0``. An empty
        ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)

    # Count the number of documents that contain each word. Keys are taken
    # from every document, not only the first, so differing key sets are safe.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    # Divide N by the document frequency and take the log of that. A word
    # with df == 0 would divide by zero; it carries no weight, so use 0.0.
    return {
        word: math.log(N / float(df)) if df else 0.0
        for word, df in docFreq.items()
    }

In [41]:
# IDF of every vocabulary word over the two-document corpus.
idfs = computeIDF([wordDictA, wordDictB])

The function 'computeTFIDF' below computes the TF-IDF score for each word, by multiplying the TF and IDF scores.

In [42]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies and IDF scores into TF-IDF weights.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word present in ``tfBow``.

    Returns:
        A dict mapping each word of ``tfBow`` to its term frequency
        multiplied by that word's IDF.
    """
    return {term: tf * idfs[term] for term, tf in tfBow.items()}

In [43]:
# TF-IDF weight of every vocabulary word, per document: TF times corpus-wide IDF.
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

### The output produced by the above code for file_1 (row 0) and the hard-coded sample sentence assigned to docB (row 1):

In [44]:
# Display the scores as a table: one row per document, one column per word.
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,are,Americas,periodically,properly,Republic,ensure,sat,U,concerned,concerning,...,Consular,Avenue,June,format,4,Finance,will,considered,and,2
0,0.003094,0.003094,0.003094,0.003094,0.003094,0.003094,0.0,0.003094,0.003094,0.003094,...,0.012378,0.003094,0.003094,0.003094,0.003094,0.003094,0.006189,0.003094,0.040227,0.009283
1,0.0,0.0,0.0,0.0,0.0,0.0,0.115525,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
