In [2]:
import re
import spacy
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO

In [3]:
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it lower-cased.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The text pdfminer produced for all pages, converted to
        lower case.

    Raises:
        OSError: If ``pdf_path`` cannot be opened (e.g. FileNotFoundError).
        Errors raised by pdfminer while parsing propagate unchanged.
    """
    # Shared resource manager used by both the converter and the interpreter
    rsrcmgr = PDFResourceManager()
    # In-memory buffer that receives the converted text
    output = StringIO()
    try:
        device = TextConverter(rsrcmgr, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(pdf_path, 'rb') as f:
                # Feed each page through the interpreter; text accumulates in `output`
                for page in PDFPage.get_pages(f):
                    interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to parse
            device.close()
        # Read the buffer only after the converter is closed, then lowercase it
        return output.getvalue().lower()
    finally:
        # Always release the buffer, on success and on error alike
        output.close()
# Extract the lower-cased text of the survey paper; the path is relative to the
# notebook's working directory.
text= extract_text_from_pdf("Survey_on_Device_to_Device_D2D_Communication_for_5GB.pdf")


In [4]:
 # Echo the extracted text as this cell's output.
 text

'received march 1, 2022, accepted march 14, 2022, date of publication march 16, 2022, date of current version march 24, 2022.\n\ndigital object identifier 10.1109/access.2022.3160215\n\nsurvey on device to device (d2d) communication\nfor 5gb/6g networks: concept, applications,\nchallenges, and future directions\n\nmohammed salih mohammed gismalla 1,2,3, (senior member, ieee),\nasrul izam azmi 1,2, (member, ieee), mohd rashidi bin salim1,2, (member, ieee),\nmohammad faiz liew abdullah 4, (senior member, ieee), farabi iqbal1,2,\nwafi a. mabrouk 4, (graduate student member, ieee),\nmaisara binti othman4, (member, ieee), adel y. i. ashyap 4,\nand abu sahmah mohd supa’at1,2\n1school of electrical engineering, faculty of engineering, universiti teknologi malaysia, skudai 81310, malaysia\n2lightwave communications research group (lcrg), innovative engineering research alliance, universiti teknologi malaysia, johor bahru 81310, malaysia\n3department of electronics and electrical engineering, f

In [5]:
def tokenizer(text, nlp=None):
    """Split text into sentences and tokens with a spaCy pipeline.

    Args:
        text: A string, or an iterable of string pieces that are
            concatenated with no separator before processing.
        nlp: Optional callable pipeline (e.g. the object returned by
            ``spacy.load``). When omitted, ``'en_core_web_sm'`` is loaded,
            which is the original behaviour. Pass a preloaded pipeline to
            avoid reloading the model on every call.

    Returns:
        tuple: ``(sentences, words)`` where ``sentences`` is the list of
        ``.text`` values from ``doc.sents`` and ``words`` is the list of
        ``.text`` values of every token in the document. Nothing is
        filtered out here.
    """
    if nlp is None:
        # Default pipeline; loading it is the expensive part of this function
        nlp = spacy.load('en_core_web_sm')
    # A str is used as-is; any other iterable of pieces is concatenated
    raw_text = text if isinstance(text, str) else ''.join(text)
    doc = nlp(raw_text)
    sentences = [sent.text for sent in doc.sents]
    words = [word.text for word in doc]
    return (sentences, words)
# Tokenize the extracted text and unpack the (sentences, words) tuple.
sentences, words = tokenizer(text)

In [6]:
# Echo the full list of sentence strings as this cell's output.
sentences

['received march 1, 2022, accepted march 14, 2022, date of publication march 16, 2022, date of current version march 24, 2022.\n\n',
 'digital object identifier 10.1109/access.2022.3160215\n\nsurvey on device to device (d2d) communication\nfor 5gb/6g networks: concept, applications,\nchallenges, and future directions\n\nmohammed salih mohammed gismalla 1,2,3, (senior member, ieee),\nasrul izam azmi 1,2, (member, ieee), mohd rashidi bin salim1,2, (member, ieee),\n',
 'mohammad faiz liew abdullah 4, (senior member, ieee), farabi iqbal1,2,\nwafi a. mabrouk 4, (graduate student member, ieee),\nmaisara binti othman4, (member, ieee), adel y. i. ashyap 4,\nand abu sahmah mohd supa’at1,2\n1school of electrical engineering, faculty of engineering, universiti teknologi malaysia, skudai 81310, malaysia\n2lightwave communications research group (lcrg), innovative engineering research alliance, universiti teknologi malaysia, johor bahru 81310, malaysia\n3department of electronics and electrical eng

In [7]:
# Echo the full token list as this cell's output (one entry per token).
words

['received',
 'march',
 '1',
 ',',
 '2022',
 ',',
 'accepted',
 'march',
 '14',
 ',',
 '2022',
 ',',
 'date',
 'of',
 'publication',
 'march',
 '16',
 ',',
 '2022',
 ',',
 'date',
 'of',
 'current',
 'version',
 'march',
 '24',
 ',',
 '2022',
 '.',
 '\n\n',
 'digital',
 'object',
 'identifier',
 '10.1109',
 '/',
 'access.2022.3160215',
 '\n\n',
 'survey',
 'on',
 'device',
 'to',
 'device',
 '(',
 'd2d',
 ')',
 'communication',
 '\n',
 'for',
 '5gb/6',
 'g',
 'networks',
 ':',
 'concept',
 ',',
 'applications',
 ',',
 '\n',
 'challenges',
 ',',
 'and',
 'future',
 'directions',
 '\n\n',
 'mohammed',
 'salih',
 'mohammed',
 'gismalla',
 '1,2,3',
 ',',
 '(',
 'senior',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 '\n',
 'asrul',
 'izam',
 'azmi',
 '1,2',
 ',',
 '(',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 'mohd',
 'rashidi',
 'bin',
 'salim1,2',
 ',',
 '(',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 '\n',
 'mohammad',
 'faiz',
 'liew',
 'abdullah',
 '4',
 ',',
 '(',
 'senior',
 'member',
 ',',
 'ie

In [9]:
def remove_stop_words(words, stop_words=None):
    """Return the words that are not stop words, preserving order.

    Args:
        words: Iterable of token strings.
        stop_words: Optional container of stop words to drop. When omitted,
            spaCy's English ``STOP_WORDS`` set is used, which is the
            original behaviour.

    Returns:
        list: The items of ``words`` that are not in ``stop_words``.
        Matching is exact membership, so it is case-sensitive and tokens
        such as punctuation pass through unless they are in ``stop_words``.
    """
    if stop_words is None:
        # Import the submodule explicitly: attribute access through
        # `spacy.lang.en` only works if something else already imported it.
        from spacy.lang.en.stop_words import STOP_WORDS
        stop_words = STOP_WORDS
    # Plain membership filter (no regular expression involved)
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words
# Drop stop words from the token list produced by tokenizer.
filtered_words = remove_stop_words(words)


In [10]:
# Echo the filtered token list as this cell's output.
filtered_words

['received',
 'march',
 '1',
 ',',
 '2022',
 ',',
 'accepted',
 'march',
 '14',
 ',',
 '2022',
 ',',
 'date',
 'publication',
 'march',
 '16',
 ',',
 '2022',
 ',',
 'date',
 'current',
 'version',
 'march',
 '24',
 ',',
 '2022',
 '.',
 '\n\n',
 'digital',
 'object',
 'identifier',
 '10.1109',
 '/',
 'access.2022.3160215',
 '\n\n',
 'survey',
 'device',
 'device',
 '(',
 'd2d',
 ')',
 'communication',
 '\n',
 '5gb/6',
 'g',
 'networks',
 ':',
 'concept',
 ',',
 'applications',
 ',',
 '\n',
 'challenges',
 ',',
 'future',
 'directions',
 '\n\n',
 'mohammed',
 'salih',
 'mohammed',
 'gismalla',
 '1,2,3',
 ',',
 '(',
 'senior',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 '\n',
 'asrul',
 'izam',
 'azmi',
 '1,2',
 ',',
 '(',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 'mohd',
 'rashidi',
 'bin',
 'salim1,2',
 ',',
 '(',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 '\n',
 'mohammad',
 'faiz',
 'liew',
 'abdullah',
 '4',
 ',',
 '(',
 'senior',
 'member',
 ',',
 'ieee',
 ')',
 ',',
 'farabi',
 'iqbal1,2',
 ',