<a href="https://colab.research.google.com/github/benmccloskey/Topic_modeling/blob/main/PDF_Extract_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
pip install PyPDF2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyPDF2
  Downloading PyPDF2-2.0.0-py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 3.4 MB/s 
Installing collected packages: PyPDF2
Successfully installed PyPDF2-2.0.0


In [None]:
import PyPDF2 #PDF Parser
import re #regular-expressions
import pandas as pd
import numpy as np
from pathlib import Path
import joblib 

In [None]:
def pdf_reader(pdf_file_path):
  """
  Opens the PDF at the given path and wraps it in a PyPDF2 reader.

  The file is read fully into memory so the on-disk handle is closed before
  this function returns; the reader works off the in-memory copy.

  Arguments:
  pdf_file_path: The file path to the desired pdf you wish to read

  returns:
  A PyPDF2.PdfFileReader for the PDF. The page count is printed, not returned.

  raises:
  OSError (e.g. FileNotFoundError) if the path cannot be opened.
  """
  import io  # local import: only needed for the in-memory buffer below

  # The reader keeps using its stream after construction, so the stream has to
  # outlive this call. Buffering the bytes lets the real file be closed here
  # instead of leaking an open handle.
  with open(pdf_file_path, 'rb') as pdf_file:
    pdf_bytes = io.BytesIO(pdf_file.read())
  pdf_file_reader = PyPDF2.PdfFileReader(pdf_bytes)
  print("PDF Successfully Read!")
  print("Number of Pages:",pdf_file_reader.numPages)
  return pdf_file_reader

In [None]:
pdf_path = '/content/drive/MyDrive/NLP/AlexNet_Paper.pdf' # Path of the PDF to analyse; it must be a quoted string.

# Open the PDF; pdf_reader prints a status line and the page count.
test_reader = pdf_reader(pdf_path)

PDF Successfully Read!
Number of Pages: 9


In [None]:
test_reader.documentInfo.keys() # These metadata keys are what pdf_info collects into its dictionary

dict_keys(['/Subject', '/Publisher', '/Language', '/Created', '/Description-Abstract', '/Producer', '/Title', '/Date', '/ModDate', '/Published', '/Type', '/firstpage', '/Book', '/Description', '/Editors', '/Author', '/lastpage'])

In [None]:
def pdf_info(read_pdf):
  """
  Takes a PDF opened with PyPDF2 and returns its document metadata.

  Arguments:
  read_pdf: A PyPDF2 reader exposing a `documentInfo` mapping

  returns:
  Dictionary mapping each metadata key, with every '/' removed
  (e.g. '/Title' -> 'Title'), to its value. Empty when the PDF carries no
  metadata.
  """
  document_info = read_pdf.documentInfo
  # A PDF without an info dictionary yields None here; treat that as "no metadata".
  if not document_info:
    return {}
  return {key.replace('/', ''): value for key, value in document_info.items()}

In [None]:
def pdf_list_to_series_and_df(pdf_info_dict, output_path='/content/drive/MyDrive/NLP/output.csv'):
  """
  Converts a PDF metadata dictionary to a Series and a DataFrame, and saves the DataFrame as CSV.

  Arguments:
  pdf_info_dict: Dictionary of metadata attribute -> value
  output_path: Where the CSV is written. Missing parent folders are created.

  returns:
  Tuple of (Series built from the dictionary, DataFrame with the columns
  "Attribute" and "Information", indexed by attribute name).
  """
  pdf_series = pd.Series(pdf_info_dict)

  attributes = list(pdf_info_dict.keys())
  pdf_df = pd.DataFrame(
      {"Attribute": attributes, "Information": list(pdf_info_dict.values())},
      index=attributes,
  )

  filepath = Path(output_path)
  filepath.parent.mkdir(parents=True, exist_ok=True)
  # The index only repeats the "Attribute" column, so it is left out of the file.
  pdf_df.to_csv(filepath, index=False)
  return pdf_series, pdf_df

In [None]:
# Preview the text of the first page.
length = test_reader.numPages
page_1 = test_reader.getPage(0)
text = page_1.extract_text()  # same extraction call that text_extraction uses
print(text)

ImageNetClassicationwithDeepConvolutional
NeuralNetworks
AlexKrizhevsky
UniversityofToronto
kriz@cs.utoronto.ca
IlyaSutskever
UniversityofToronto
ilya@cs.utoronto.ca
GeoffreyE.Hinton
UniversityofToronto
hinton@cs.utoronto.ca
Abstract
We trained a large, deep convolutional neural network to classify the 1.2 million
high-resolution images in the ImageNet LSVRC-2010 contest into the 1000 dif-
ferent classes. On the test data, we achieved top-1 and top-5 error rates of 37.5%
and 17.0% which is considerably better than the previous state-of-the-art. The
neural network, which has 60 million parameters and 650,000 neurons, consists
of ve convolutional layers, some of which are followed by max-pooling layers,
and three fully-connected layers with a nal 1000-way softmax. To make train-
ing faster, we used non-saturating neurons and a very efcient GPU implemen-
tation of the convolution operation. To reduce overtting in the fully-connected
layers we employed a recently-developed regularizat

In [None]:
def text_extraction(read_pdf):
  """
  Builds a page-by-page text lookup for a PDF opened with PyPDF2.

  Arguments:
  read_pdf: A PyPDF2 reader providing `numPages` and `getPage`

  Returns:
  Dictionary keyed by zero-based page index whose values are the text
  extracted from that page.
  """
  return {
      page_number: read_pdf.getPage(page_number).extract_text()
      for page_number in range(read_pdf.numPages)
  }

In [None]:
def pdf_cleaner(pdf_path):
  """
  Runs the whole extraction pipeline on one PDF.

  Arguments:
  pdf_path: The file path of the PDF to process

  Returns:
  Tuple of (reader, metadata dictionary, page-text dictionary, metadata DataFrame).
  """
  reader = pdf_reader(pdf_path)  # open the PDF
  metadata = pdf_info(reader)  # document metadata as a dictionary
  # A Series is produced as well, but only the DataFrame is handed back.
  _, metadata_frame = pdf_list_to_series_and_df(metadata)
  pages = text_extraction(reader)  # page index -> extracted text
  return reader, metadata, pages, metadata_frame

In [None]:
# Run the full pipeline: reader, metadata dict, page-text dict and metadata DataFrame.
converted_pdf, info_pdf, dict_pdf, info_csv = pdf_cleaner(pdf_path)

PDF Successfully Read!
Number of Pages: 9



## The following block is where IP addresses are investigated.

In [None]:
# Sample text containing repeated IP addresses, used to try out find_ip.
c = 'Will this work 110.234.52.124 will hia 192.168.0.1 g 110.234.52.124 will hia 192.168.0.1 '

In [None]:
def find_ip(text):
  """
  Function that accepts a string of text and finds valid IPv4 addresses.

  Parameters
  text: A string of text

  Returns:
  Set of the distinct dotted-quad addresses in the text whose four octets are
  all in the range 0-255. Prints a notice and returns an empty set when there
  are none.
  """
  # The digit look-arounds stop a match from starting or ending in the middle
  # of a longer run of digits (e.g. inside "1234.5.6.7").
  pattern = re.compile(r'(?<!\d)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?!\d)')

  ip_list = {
      candidate
      for candidate in pattern.findall(text)
      if all(int(octet) <= 255 for octet in candidate.split('.'))
  }

  # A set never compares equal to [], so emptiness is tested directly.
  if not ip_list:
    print('No IP addresses found.')

  return ip_list

In [None]:
check = find_ip(c)  # distinct IP addresses found in the sample string
check


{'110.234.52.124', '192.168.0.1'}

## The following code searches for email addresses

In [None]:
def page_to_one_line(pdf_pages):
  """ Collapses the text of every page of an extracted PDF onto a single line.

  Arguments:
  pdf_pages: Page text indexed 0..len(pdf_pages)-1

  Returns:
  Dictionary with the same indices where each newline in the page text has
  been replaced by a space.
  """
  return {
      page_number: pdf_pages[page_number].replace("\n", " ")
      for page_number in range(len(pdf_pages))
  }

In [None]:
check = page_to_one_line(dict_pdf)  # every page's text with newlines replaced by spaces

In [None]:
def find_email(text):
  """
  Finds every email address in a string of text.

  Parameters
  text: A string of text

  Returns:
  List of the addresses in the order they appear (duplicates kept); an empty
  list when there are none.
  """
  # Every dot in the domain must be followed by a label, so a sentence-ending
  # period after an address ("... a@b.com.") is not swallowed into the match.
  pattern = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
  return pattern.findall(text)

In [None]:
def print_emails(pdf_dict):
  """
  Collects the email addresses found on each page of a PDF.

  Arguments:
  pdf_dict: Page text indexed 0..len(pdf_dict)-1

  Returns:
  Dictionary mapping page index to the list find_email returns for that page.
  Pages whose list is empty are left out.
  """
  emails_by_page = {}
  for page_number in range(len(pdf_dict)):
    found = find_email(pdf_dict[page_number])
    if found != []:  # keep only pages that actually contain an address
      emails_by_page[page_number] = found
  return emails_by_page

In [None]:
check = print_emails(check)  # page index -> email addresses found on that page
check

{0: ['kriz@cs.utoronto.ca', 'ilya@cs.utoronto.ca', 'hinton@cs.utoronto.ca']}

## Creating Code that can find all of the IP Addresses on a Page

In [None]:
def find_ip(text):
  """
  Function that accepts a string of text and finds valid IPv4 addresses.

  Parameters
  text: A string of text

  Returns:
  Set of the distinct dotted-quad addresses in the text whose four octets are
  all in the range 0-255. Prints a notice and returns an empty set when there
  are none.
  """
  # The digit look-arounds stop a match from starting or ending in the middle
  # of a longer run of digits (e.g. inside "1234.5.6.7").
  pattern = re.compile(r'(?<!\d)(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?!\d)')

  ip_list = {
      candidate
      for candidate in pattern.findall(text)
      if all(int(octet) <= 255 for octet in candidate.split('.'))
  }

  # A set never compares equal to [], so emptiness is tested directly.
  if not ip_list:
    print('No IP addresses found.')

  return ip_list

In [None]:
def pdf_ip_address_finder(pdf_dict):
  """
  Collects the IP addresses found on each page of a PDF.

  Arguments:
  pdf_dict: Page text indexed 0..len(pdf_dict)-1

  Returns:
  Dictionary mapping page index to the set of IP addresses find_ip reports for
  that page. Pages without any IP address are left out.
  """
  iplist = {}
  for i in range(len(pdf_dict)):
    page_ips = find_ip(pdf_dict[i])
    # find_ip returns a set, which never equals [], so emptiness is tested by
    # truthiness to drop pages that contain no IP address.
    if page_ips:
      iplist[i] = page_ips
  return iplist

In [None]:
# Single-page sample containing repeated IP addresses.
check_dict = {0 : 'Will this work 110.234.52.124 will hia 192.168.0.1 g 110.234.52.124 will hia 192.168.0.1 '}
check = page_to_one_line(dict_pdf)  # flattened pages of the loaded PDF; not used by the call below
pdf_ip_address_finder(check_dict)


{0: {'110.234.52.124', '192.168.0.1'}}