## Import all Libraries

In [10]:
import PyPDF2
import nltk
import gspread
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem import WordNetLemmatizer
from gensim.corpora.dictionary import Dictionary
from gensim.models.tfidfmodel import TfidfModel
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/arsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/arsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/arsh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Read the PDF File

In [11]:
file = '/home/arsh/Desktop/intern1/JavaBasics-notes.pdf'
# open allows you to open the files
pdfobj = open(file,'rb')
# pdfReader is a readable object that will be parsed.
pdfReader = PyPDF2.PdfFileReader(pdfobj)
# discerning the number of pages will allow us to parse a number of pages.
num_pages = pdfReader.numPages
count=0
text=''
# the while loop will read each page
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    text += pageObj.extractText()


## Convert text into key words

In [12]:
tokens = word_tokenize(text)
tokens[:10]

['Java',
 'Basics©',
 '1996-2003',
 'jGuru.com',
 '.',
 'All',
 'Rights',
 'Reserved.Java',
 'Basics',
 '-1Java']

## Preprocessing on Text 

In [13]:
# convert all words to lower case.
lower_tokens = [t.lower() for t in tokens]
# remove all non-alphanumeric words
alpha_only = [t for t in lower_tokens if t.isalpha()]
# remove all stopwords
no_stops = [t for t in alpha_only if t not in stopwords.words('english')]
# lemmatize all the words.
wrdntlzr = WordNetLemmatizer()
lemmatized = [ wrdntlzr.lemmatize(t) for t in no_stops ]
lemmatized[:10]

['java',
 'right',
 'basic',
 'basicstopics',
 'section',
 'include',
 'make',
 'java',
 'program',
 'portable']

## Create a Simple Bag-Of-Words

In [14]:
bow_simple = Counter(lemmatized)
bow_simple.most_common(20)

[('java', 71),
 ('new', 47),
 ('data', 42),
 ('applet', 37),
 ('object', 37),
 ('button', 36),
 ('array', 30),
 ('class', 29),
 ('int', 29),
 ('code', 27),
 ('method', 27),
 ('string', 26),
 ('b', 26),
 ('basic', 24),
 ('right', 23),
 ('public', 21),
 ('program', 18),
 ('example', 18),
 ('type', 15),
 ('language', 14)]

## using tf-idf with Gensim to find most important words.

In [15]:
# creating a list of pages
list_of_pages=[]
count=0

#extracting all pages and adding as a new item to the list_of_pages
while count < num_pages:
    pageObj = pdfReader.getPage(count)
    count += 1
    list_of_pages += [ pageObj.extractText() ]
    
#lowercasing every word in all the pages
lower_list_of_pages = [ page.lower() for page in list_of_pages]

# word_tokenising every page in the list_of_pages and adding as a list to lop_tokenized
lop_tokenized = [ word_tokenize(t) for t in lower_list_of_pages ]

# initialising a lemmatized_pages_list that will contain the lists of tokenised words after preprocessing
lemmatized_pages_list = []

for t in lop_tokenized:
    alpha_only = [p for p in t if p.isalpha()]
    no_stops = [ p for p in alpha_only if p not in stopwords.words('english') ]
    lemmatized = [ wrdntlzr.lemmatize(p) for p in no_stops ]
    lemmatized_pages_list+=[lemmatized] 
    
# initialising a dictionary object on lemmatized_pages_list
dictionary = Dictionary(lemmatized_pages_list)

# creating a bag-of-words corpus from dictionary for every page in lemmatized_pages_list
corpus = [ dictionary.doc2bow(doc) for doc in lemmatized_pages_list]

# instantiating an tfidf object
tfidf = TfidfModel(corpus)

# making a new dictionary that would contain all the important words along with their weights
dic = {}
for doc in corpus:
    tfidf_weight = tfidf[doc]
    sorted_tfidf_weights = sorted(tfidf_weight, key=lambda w: w[1], reverse=True)
    # Print the top 5 weighted words
    for term_id, weight in sorted_tfidf_weights:
        dic[dictionary.get(term_id)]= weight

# creating a list of tuples from the dictionary in descending order of their values.
sorted_by_value = sorted(dic.items(), key=lambda kv: kv[1],reverse = True)
sorted_by_value[:10]

[('reserved', 0.9402500069807611),
 ('comment', 0.7277421278035406),
 ('data', 0.6013084800711328),
 ('blank', 0.5),
 ('intentionally', 0.5),
 ('left', 0.5),
 ('page', 0.5),
 ('bit', 0.4511811739960831),
 ('label', 0.3925016345955494),
 ('ints', 0.3763637381615694)]

## Converting the dictionary to a DataFrame

In [36]:
import pandas as pd
df = pd.DataFrame(list(dic.items()), columns=['keywords','weights'])
df = df.sort_values('weights',ascending=False).reset_index()
del df['index']
df.iloc[:10]

Unnamed: 0,keywords,weights
0,reserved,0.94025
1,comment,0.727742
2,data,0.601308
3,blank,0.5
4,page,0.5
5,intentionally,0.5
6,left,0.5
7,bit,0.451181
8,label,0.392502
9,ints,0.376364


## Exporting the dataframe to an excel sheet

In [45]:
df.to_excel('keywordsAndWeights.xlsx')