In [1]:
from tika import parser
import pandas as pd
import os
import json
import glob
import spacy
import re
import textstat

# Load the English spaCy pipeline once; PDFProcessor reuses it for every file.
# The bare 'en' shortcut link was removed in spaCy 3, so the model is
# referenced by its full package name, which spaCy 2 accepts as well.
nlp = spacy.load('en_core_web_sm')
# Directory that receives one JSON result file per processed PDF.
JSON_path = os.path.join(".", "JSON")

class PDFProcessor:
    """Compute simple text metrics for a single PDF document.

    The PDF is converted to text with Tika and run through the module-level
    spaCy pipeline ``nlp``; every metric is derived from the resulting
    document stored in ``self.doc``.
    """

    def __init__(self, file):
        """Parse ``file`` with Tika and tokenise the extracted text.

        Args:
            file: Path of the PDF to analyse.
        """
        parsedPDF = parser.from_file(file)
        # Tika yields no content (None) when it cannot extract any text,
        # e.g. for image-only PDFs. Fall back to an empty string so the
        # spaCy pipeline is never handed a non-string.
        self.doc = nlp(parsedPDF.get("content") or "")

    def process(self, dest):
        """Compute all metrics, write them to ``dest`` as JSON and return them.

        Args:
            dest: Path of the JSON file to write (overwritten if it exists).

        Returns:
            dict with the keys "Flesch reading ease",
            "Number of PCM mentions" and "List of kWh numerical data".
        """
        data = {
            "Flesch reading ease": self._flesch_ease(),
            "Number of PCM mentions": self._find_pcm(),
            "List of kWh numerical data": self._find_kwh(),
        }

        with open(dest, 'w') as file:
            file.write(json.dumps(data))

        return data

    def _find_kwh(self):
        """Return every "<number> kWh" occurrence found in the document text.

        Matching is case-insensitive and allows at most one whitespace
        character (which may be a line break) between number and unit.
        The matched substrings are returned unmodified, in document order.
        """
        # Raw string: \d, \. and \s are regex escapes, not Python escapes.
        regex_kwh = r"\d*\.?\d+?\s?kwh"
        return re.findall(regex_kwh, str(self.doc), re.IGNORECASE)

    def _find_pcm(self):
        """Count case-insensitive occurrences of "pcm" in non-stop-word tokens.

        Substring matches count as well, so a token such as "PCMs"
        contributes one mention.
        """
        # Count per token text instead of searching the repr of a Python
        # list of tokens; the tally is the same but no longer depends on
        # how the list and its elements happen to be rendered.
        return sum(
            len(re.findall('pcm', token.text, re.IGNORECASE))
            for token in self.doc
            if not token.is_stop
        )

    def _flesch_ease(self):
        """Return the Flesch reading-ease score textstat computes for the text."""
        return textstat.flesch_reading_ease(str(self.doc))
    
def process_file(file):
    """Analyse one PDF and store its metrics as JSON inside ``JSON_path``.

    The result file is named after the PDF's base name with ``.json``
    appended (``report.pdf`` -> ``report.pdf.json``).

    Args:
        file: Path of the PDF to process.

    Returns:
        The metrics dict returned by ``PDFProcessor.process``.
    """
    processor = PDFProcessor(file)
    # exist_ok replaces the isdir() check followed by makedirs(), which
    # could fail if the directory appeared between the two calls.
    os.makedirs(JSON_path, exist_ok=True)
    # Derive the destination from JSON_path (the directory just created)
    # rather than a separately hard-coded prefix, and keep only the base
    # name so a PDF given with a directory component still lands inside it.
    dest = os.path.join(JSON_path, os.path.basename(file) + ".json")

    return processor.process(dest)
    
# glob returns matches in arbitrary, filesystem-dependent order; sorting
# keeps the order of the results stable from one run to the next.
files = sorted(glob.glob("*.pdf"))
# One metrics dict per PDF, in the same order as `files`.
data = [process_file(pdf_path) for pdf_path in files]

In [2]:
# Collect the per-file metric dicts into a single table, one row per entry
# of `data`, and show it as the cell's output.
dF = pd.DataFrame(data)
dF

Unnamed: 0,Flesch reading ease,List of kWh numerical data,Number of PCM mentions
0,46.00,[],0
1,47.93,[],0
2,44.07,"[2.8 kWh, 5.4 kWh, 7.5 kWh, 4.2\nkWh, 5.47 kWh]",232
3,49.38,[],0
4,40.28,"[6.8 kWh, 6.9 kWh, 10.9 kWh, 15 kWh]",131
5,52.43,[],0
6,46.00,"[0.693 kWh, 0.591 kWh, 0.693 kWh, 0.5 kWh, 0.5...",3
7,42.85,"[10113.6 kWh, 7080 kWh, 3033.6 kWh, 10113.6 kW...",0
8,40.31,[],0
9,13.51,[],0
