In [1]:
import pandas as pd
import numpy as np
import pdfquery
import json
from shapely.geometry import box
from shapely.ops import cascaded_union
import pdftableextract as pte
from math import floor
from pymongo import MongoClient
import os
from os import listdir
from os.path import isfile, join, isdir
import time
import re
import pickle
import hashlib
import unidecode
from fuzzysearch import find_near_matches
import textract
import unicodedata
import ast
import psycopg2
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from configuration.configuration import ConfigClass,DbConf
import shutil
import datetime
import multiprocessing as mp
import datefinder

  """)


In [2]:
def keywordimport():
    engine = create_engine(ConfigClass.SQLALCHEMY_DATABASE_URI)

    susp_df= pd.read_sql_query('SELECT k.id,k.file_class as fileclass,k.file_type as filetype,\
                               k.keyword,k.remove_class FROM suspend_keywords k',engine)
    keyword_df= pd.read_sql_query('SELECT k.id,k.file_class as fileclass,k.file_type as filetype,k.purpose,\
                                k.decision_type,k.keyword,k.bias as bias,k.sub as sub FROM keywords k',engine)
    keyword_df['sub']=keyword_df['sub'].apply(lambda x : json.loads(x) if x!=None else [])
    keyword_df['keyword']=keyword_df['keyword'].apply(lambda x : json.loads(x))

    return keyword_df,susp_df

In [3]:
keyword_df,susp_df = keywordimport()

In [5]:
#Getting number of pages in a pdf file    
def get_pgnum(filename):
    pdf = pdfquery.PDFQuery(ConfigClass.UPLOAD_FOLDER + "/" + filename)
    pdf.load()
    pgn = len(pdf.tree.getroot().getchildren())
    return pgn

In [6]:
# Takes a dataframe of filegroups which also contains the list of files as its argument 
# and returns a dataframe files with columns, name,file group and 
# its filetype(right now its either ticket or other as the file with smallest name in a filegroup is ticket)
def get_structured_files_dataframe(df):
    df = df.groupby('group').files.apply(lambda x:pd.DataFrame(x.values[0])).reset_index().drop('level_1', axis=1)
    df.columns = ['group', 'file']
    df["ext"] = df.apply(lambda x : x.file.split(".")[-1],axis =1)
    df["length"] = df.apply(lambda x : len(x.file),axis =1)
    TI, OI = [], []
    for case, indices in list(df.groupby("group").groups.items()):
        ticket_index = df.loc[indices].length.values.argmin()
        for i, idx in enumerate(indices):
            if i == ticket_index:
                TI.append(idx)
            else:
                OI.append(idx)
    df['type'] = "OTHER"
    df.loc[TI,"type"] = "TICKET"
    return df

In [7]:
#takes a df and returns to json, the df is taken from the data extracted from the tables 
#of pdfs using pdf tables extract
def df_to_json(df):
    js = {}
    vals = df[0].values
    ls = []
    for i, v in enumerate(vals):
        if not v:
            ls.append(ls[-1])
        else:
            ls.append(v)
    df[0] = ls
    for key, idx in list(df.groupby([0]).groups.items()):
        js[key] = {}
        for val in df.loc[idx][[1,2]].values:
            if val[0] == "" and val[1] == "":
                js[key]["_value"] = ""
            elif val[0] == "" and val[1] != "":
                js[key]["_value"] = val[1]
            elif val[0] != "" and val[1] == "":
                js[key]["_value"] = val[0]
            elif val[0] != "" and val[1] != "":
                js[key][val[0]] = val[1]                
        if(key=="Documentos"):
            js[key] = df.loc[idx][[1,2]].values.tolist()
        if len(js[key]) == 1 and "_value" in js[key]:
            js[key] = js[key]["_value"]
    return js

In [8]:
#takes the path of the pdf and returns its text using pdf query the text extracted will be sorted 
#accoring to its y cooridinates of its bounding boxes
def get_pdf_text(path):
    try:
        pdf=pdfquery.PDFQuery(path)
        pdf.load()
        pdftext=""
        nodelis=[]
        pgn=len(pdf.tree.getroot().getchildren())
        for i in range(0,pgn):
            root = pdf.tree.getroot().getchildren()[i]
            npg=[]
            for node in root.iter():
                try:
                    if node.text and float(node.get("y0"))>50 and float(node.get("x0"))>50:
                        npg.append(node)
                except Exception as e:
                    print((node.tag, e))
            sorted_npg=sorted(npg, key=lambda x: float(x.get("y0")),reverse=True)
            if len(sorted_npg)>0:
                prev_y=sorted_npg[0].get("y0")
                for x in sorted_npg: 
                    esc="\n"
                    if x.get("y0")==prev_y:
                        esc="|"
                    prev_y=x.get("y0")
                    pdftext+=esc+x.text
        return pdftext
    except Exception as e:
        return('Error:'+str(e))

In [9]:
#takes rtf files path and returns its text    
def get_rtf_text(path):
    text = os.popen('unrtf --text '+path).read()
    return text

In [10]:
#takes the path of a pdf and extract table 1 and table 2 of tickets and returns its json 
def parse_ticket(pdf_path):
    _JSON = {"filepath" : pdf_path}
    try:
        pdf = pdfquery.PDFQuery(pdf_path)
        pdf.load()
        root = pdf.tree.getroot().getchildren()[0]
        page_box = [float(x) for x in root.get("bbox")[1:-1].split(",")]
        tables, _ =\
        list(zip(
            *sorted(
                [(p.bounds,p.area) for p in cascaded_union(
                    [box(*[float(x) for x in node.get("bbox")[1:-1].split(",")]) for node in root.iter() if node.tag == "LTRect"]
                )],
                key = lambda x : -x[1]
            )
        ))
        X = page_box[2]
        Y = page_box[3]
        xf = 11.69/X
        yf = 8.27/Y
        t1, t2 = tables

        table_1_bbox = ":".join(map(str,(t1[0]*xf - 0.1, (Y - t1[3])*yf - 0.1, t1[2]*xf + 0.1, (Y - t1[1])*yf + 0.1)))
        table_2_bbox = ":".join(map(str,(t2[0]*xf - 0.1, (Y - t2[3])*yf - 0.1, t2[2]*xf + 0.1, (Y - t2[1])*yf + 0.1)))

        df1 =\
        pd.DataFrame(
            pte.table_to_list(
                pte.process_page(
                    pdf_path,
                    "1",
                    crop = table_1_bbox,
                    pad=20
                ),
                "1"
            )[1]
        )
        _JSON["table_1"] = df_to_json(df1)
        df2 = \
        pd.DataFrame(
            pte.table_to_list(
                pte.process_page(
                    pdf_path,
                    "1",
                    crop = table_2_bbox,
                    pad=20
                ),
                "1"
            )[1]
        )
        df2.columns = df2.iloc[0]
        df2 = df2.reindex(df2.index.drop(0))
        _JSON["table_2"] = df2.to_json(orient='index')
#         print(_JSON)
    except Exception as e:
            return('Error:'+str(e))
    return json.dumps(_JSON, ensure_ascii=False)


In [11]:
#takes a path of a file(pdf and rtf) and extract its texts and remove its accents of spansish characters
def parse_other(pf):
    text=""

    try:
        if pf[-3:].lower()!='pdf':
            newtext =textract.process(pf)
        else:
            newtext=get_pdf_text(pf)
        if(len(newtext.split())==0):
            newtext =textract.process(pf,method='tesseract')
        try:
            newtext=(''.join((c for c in unicodedata.normalize('NFD', newtext) if unicodedata.category(c) != 'Mn'))).lower()
        except Exception as e:
            newtext=(''.join((c for c in unicodedata.normalize('NFD', newtext.decode("utf-8")) if unicodedata.category(c) != 'Mn'))).lower()
        if pf[-3:].lower()=='rtf':
            print("rtf ----------->",newtext)
        rem=''
        paratlist=['MODO DE IMPUGNACION:'.lower(),'mode d\'impugnacio',
                   'recurso de repelacion','recurs de reposicio','recurso de reposicion',
                   'recurso de apelacion']
        for parat in paratlist:
            if (parat.lower() in newtext) :
                rem=newtext.split(parat.lower())[-1]
        newtext=newtext.replace(rem,'')
    except Exception as e:
        newtext='Error:'+str(e)
    return newtext

In [12]:
#takes file data frame and returns its table response(table json) and text response
def parsefile(fdf,PDF_DIR,co):
    for i, r in fdf.iterrows():
        ticresponse=""
        textresponse=""
        try:
            ticresponse= parse_ticket(join(PDF_DIR,r.filename))
        except Exception as e:
            ticresponse='Error:'+str(e)
        try:
            if r.filename[-3:].lower()!='zip':
                if r.filetype=='TICKET':
                        textresponse=ticresponse
                else:
                    textresponse=parse_other(join(PDF_DIR,r.filename))
                    if 'Error:' in textresponse:
                        print("text",i,textresponse)
            else:
                textresponse=""
                with zipfile.ZipFile(join(PDF_DIR,r.filename)) as z:
                    for fileinzip in z.namelist():

                        if not os.path.isdir(fileinzip):
                            # read the file
                            zfdir=join(PDF_DIR, os.path.basename(fileinzip))
                            with z.open(fileinzip) as fz,open(zfdir, 'wb') as zfp:
                                        shutil.copyfileobj(fz, zfp)
                                        text=parse_other(join(PDF_DIR,zfdir))
                                        if text[:5]!='Error':
                                            textresponse+=text
                                        os.remove(zfdir)
        except Exception as e:
            textresponse='Error:'+str(e)
        if (r.filename[-3:].lower()=='rtf'):
            print("___________________________________________________")
            print(i,textresponse)

        fdf.loc[i,"table_response"] = ticresponse
        fdf.loc[i,"text_response"] = textresponse
    return fdf

In [13]:
# Once the table json is extracted we know which is the principal notification file and moves on to update it
def update_filetype(fdf):
    fgs=fdf.groupby('filegroup')
    fgdf=pd.DataFrame(columns=['filegroup'])
    i=0
    fgdf['files']=np.empty((len(fgs.groups), 0)).tolist()
    fgdf['filetypes']=np.empty((len(fgs.groups), 0)).tolist()
    for k,v in list(fgs.groups.items()):
        fgdf.loc[i,'filegroup']=k
        files=[]
        pcs=[]
        filetypes=[]
        for ind in v:
                files.append(fdf.loc[ind,'filename'])
                filetypes.append(fdf.loc[ind,'filetype'])
        fgdf.loc[i,'filetypes']=filetypes
        fgdf.loc[i,'files']=files
        i+=1
    for i ,r in fdf[~(fdf['table_response'].str.contains('Error:'))&(fdf['filetype']=='TICKET')].iterrows():
        if r['table_response'][:5]!='Error':
            js=json.loads(r['table_response'])
            pf=r['filename'].split('.')[0]+'_'+''.join(js['table_1']['Documentos'][0][0].split()).split('(Principal)')[0]
            fgf=[''.join(x.split())for x in fgdf.loc[fgdf['filegroup']==r['filegroup'],'files'].values[0] ]
            if pf in fgf:
                fl=fgdf.loc[fgdf['filegroup']==r['filegroup'],'files'].values[0][fgf.index(pf)]
                fdf.loc[fdf['filename'].str.contains(fl),'filetype']="NOTIFICATION"
    return fdf,fgdf


In [15]:
# A recursive function used for classification based on the hierarchy of the keywords 
#and and its occurence in a particuar file
def get_predclass_normal(kdf,row,j,text,fdf):
    bias=row['bias']
    if len(row['sub'])>0:
        for i, kr in kdf[(kdf['id'].isin(row['sub']))].iterrows():
            f=True
            for k in kr['keyword']:
                if not (''.join(unidecode.unidecode(k).split()).lower() in text):
                    f=False
            if f:
                bias1=get_predclass_normal(kdf,kr,j,text,fdf)
                bias=bias1
                if kr['fileclass'] in list(fdf.loc[j,'keywords'].keys()):
                    fdf.loc[j,'keywords'][kr['fileclass']].append(kr['keyword'])
                else:
                    fdf.loc[j,'keywords'][kr['fileclass']]=list()
                    fdf.loc[j,'keywords'][kr['fileclass']].append(kr['keyword'])
    return bias

In [16]:
# A recursive function used for classification based on the hierarchy of the keywords and and its occurence in a particuar file through fuzzy search of distance 1
def get_predclass_fuzzy(kdf,row,j,text,fdf):
    bias=row['bias']
    if len(row['sub'])>0:
        for i, kr in kdf[(kdf['id'].isin(row['sub']))].iterrows():
            f=True
            for k in kr['keyword']:
                try:
                    match=find_near_matches(''.join(unidecode.unidecode(k).split()).lower(), text, max_l_dist=1)
                except Exception as e:
                    f=False
                if len(match)==0:
                    f=False
            if f:
                bias1=get_predclass_normal(kdf,kr,j,text,fdf)
                bias=bias1
                if kr['fileclass'] in list(fdf.loc[j,'keywords'].keys()):
                    fdf.loc[j,'keywords'][kr['fileclass']].append(kr['keyword'])
                else:
                    fdf.loc[j,'keywords'][kr['fileclass']]=list()
                    fdf.loc[j,'keywords'][kr['fileclass']].append(kr['keyword'])
    return bias


In [17]:
#File classification based on notification bibles   
def get_classify_result(fdf,kdf,suspkdf,notification_corelation_dict):
    kdf["document_list"]=np.empty((len(kdf), 0)).tolist()
    fdf["keywords"]=fdf["filename"].apply(lambda x:{})
    fdf["pred_class"]=fdf["filename"].apply(lambda x:list())
    fdf["remove_class"]=fdf["filename"].apply(lambda x:list())
    fdf["after_classfn"]=fdf["filename"].apply(lambda x:list())
    fdf["final_categ"]=fdf["filename"].apply(lambda x:list())
    fdf.loc[fdf['filetype']=='TICKET','text_response']=fdf['table_response']
    NX_filename_N1_N5=['ICO','S3A','S05','S02','S02','S5L','S04','S01','CNA','S5C','PCO','ASE','S1C']
    classlis=[]
    for j,row in fdf.iterrows():
        if row['filetype']=='TICKET':
            text=''.join(unidecode.unidecode((row['text_response'])).split()).lower()
        else:
            text=''.join(row['text_response'].split())
        paratlist=['MODO DE IMPUGNACION:'.lower(),'mode d\'impugnacio',
                   'recurso de repelacion','recurs de reposicio','recurso de reposicion','recurso de apelacion']
        rem=''
        for parat in paratlist:
            if (parat.lower() in text) :
                rem=text.split(parat.lower())[-1]
        text=text.replace(rem,'')
#         text=text.split("impugnacion")[0]



### write logic for split header and body



        for i,kdrow in kdf.iterrows():
#         #       
            if text[:5]!='Error':

                f=True

                for k in kdrow['keyword']:
                    if not (''.join(unidecode.unidecode(k).split()).lower() in text):
                        f=False
                if f and (kdrow['filetype']==row['filetype'] or(kdrow['filetype']=='NOTIFICATION' and row['filetype']=='OTHER') )and (kdrow['purpose']=='CLASSIFICATION'):
                    fdf.loc[j,'pred_class'].append(get_predclass_normal(kdf,kdrow,j,text,fdf))
                    if kdrow['fileclass'] in fdf.loc[j,'keywords'].keys():
                        fdf.loc[j,'keywords'][kdrow['fileclass']].append(kdrow['keyword'])
                    else:
                        fdf.loc[j,'keywords'][kdrow['fileclass']]=list()
                        fdf.loc[j,'keywords'][kdrow['fileclass']].append(kdrow['keyword'])

        if not bool(fdf.loc[j,'keywords']):
            for i,kdrow in kdf.iterrows():
                if text[:5]!='Error':

                    f=False
                    if(kdrow['filetype']==row['filetype'] or(kdrow['filetype']=='NOTIFICATION' and row['filetype']=='OTHER') )and (kdrow['purpose']=='CLASSIFICATION'):
                        f=True
                        for k in kdrow['keyword']:
                            match=[]
                            try:
                                match=find_near_matches(''.join(unidecode.unidecode(k).split()).lower(), text, max_l_dist=1)
                            except Exception as e:
                                f=False
                            if len(match)==0:
                                f=False

                    if f:
                        fdf.loc[j,'pred_class'].append(get_predclass_fuzzy(kdf,kdrow,j,text,fdf))
                        if kdrow['fileclass'] in fdf.loc[j,'keywords'].keys():
                            fdf.loc[j,'keywords'][kdrow['fileclass']].append(kdrow['keyword'])
                        else:
                            fdf.loc[j,'keywords'][kdrow['fileclass']]=list()
                            fdf.loc[j,'keywords'][kdrow['fileclass']].append(kdrow['keyword'])
        fdf.set_value(j,'pred_class',list(set(fdf.loc[j,'pred_class'])))

        for si,sr in suspkdf.iterrows():
                f=True
                for sk in sr['keyword']:

                    if not (''.join(unidecode.unidecode(sk).split()).lower() in ''.join(row['text_response'].split() )):
                        f=False
                if f and (row['filetype']=='NOTIFICATION' or row['filetype']=='OTHER'):
#                     if sr['remove_class'] in fdf.loc[j,'pred_class']:
                        if  not ((sr['remove_class']=='N2' or sr['remove_class']=='N12') and (''.join(("SE ALZA LA SUSPENSION DE LAS ACTUACIONES").split()).lower() in text)) :

                            if 'NX-'+sr['remove_class'] in fdf.loc[j,'keywords'].keys():
                                fdf.loc[j,'keywords']['NX-'+sr['remove_class']].append(sr['keyword'])
                            else:
                                fdf.loc[j,'keywords']['NX-'+sr['remove_class']]=list()
                                fdf.loc[j,'keywords']['NX-'+sr['remove_class']].append(sr['keyword'])
                            fdf.loc[j,'remove_class'].append(sr['remove_class'])
        f=True
        fdf.set_value(j,'remove_class',list(set(fdf.loc[j,'remove_class'])))
        if not 'N-ALL' in fdf.loc[j,'remove_class']:
            for cl in list(set(fdf.loc[j,'pred_class'])-set(fdf.loc[j,'remove_class'])):
                fdf.loc[j,'after_classfn'].append(cl)
        for cla in list(fdf.loc[j,'after_classfn'] ): 
            s=set(notification_corelation_dict[cla])
            if not(set(row['after_classfn'])<=(s) and set([cla])<=set(row['after_classfn'])):
                f=False
        if f:
            fdf.set_value(j,'final_categ',list(set(fdf.loc[j,'after_classfn'])))
        if 'N1' in fdf.loc[j,'final_categ'] or 'N5' in fdf.loc[j,'final_categ']:
            fl=False
            for flnx in NX_filename_N1_N5:
                if flnx in fdf.loc[j,'filename'].split('_')[2]:
                    fl=True
            if fl:
                if 'N1' in fdf.loc[j,'final_categ']:
                    fdf.loc[j,'final_categ'].remove('N1')
                elif 'N5' in fdf.loc[j,'final_categ']:
                    fdf.loc[j,'final_categ'].remove('N5')

## lauren special case
    fgs=fdf.groupby('filegroup')
    for k,v in fgs.groups.items():
        f=False
        otherclass="N16"
        for ind in v:
            if 'N16' in fdf.loc[ind,'final_categ']:
                f=True
            if 'N8' in fdf.loc[ind,'final_categ']:
                otherclass='N8'
            elif 'N15' in fdf.loc[ind,'final_categ']:
                otherclass='N15'
        for ind in v:
            if f:
                for n, i in enumerate(fdf.loc[ind,'final_categ']):
                    if i == 'N16':
                        fdf.loc[ind,'final_categ'][n] = otherclass


    return fdf

In [18]:
def read_pdf_n_insert(root_new):
    PDF_DIR = '/home/thrymr/Racmo/Testfiles'
    pdf_files = [f for f in listdir(PDF_DIR)\
             if isfile(join(PDF_DIR,  f)) ]
    
    if len(pdf_files)>0:
        df = pd.DataFrame(pdf_file.split('_') for pdf_file in pdf_files)
        df = df[pd.notnull(df[3])]
        file_groups = df.groupby(3).groups
        grouped_files = list()
        for file_group, idx in file_groups.items():
            filenames = ['_'.join((df.iloc[ind]).dropna()) for ind in idx]
            extensions = [fn[-3:] for fn in filenames]
            fgroup={'group':file_group, 'files':filenames, 'length':len(filenames), 
                    'min_filename':min(filenames, key=len),'extensions':extensions}
            grouped_files.append(fgroup)
        flgdf=pd.DataFrame(grouped_files)
        flgdf=flgdf.dropna(thresh=1,axis=1)
        fdf = get_structured_files_dataframe(flgdf)
        fdf=fdf.rename(columns={"file":"filename","group":"filegroup","type":"filetype"})
        print("parsing")
        pool = mp.Pool(processes=4)
        ind=int()
        n=len(fdf)
        results = [pool.apply(parsefile, args=(fdf[int(x*len(fdf)/n):int((x+1)*len(fdf)/n)],PDF_DIR,x,)) for x in range(0,n)]
        fdf=pd.concat(results)
        fdf,fgdf=update_filetype(fdf)
        notification_corelation_dict = { 'N1' : {'N1','N4','N7','N11','N13'},
                   'N2' : {'N2','N4','N7','N8','N11','N13','N15','N16'},
                   'N3' : {'N3','N4','N7','N8','N11','N13','N15','N16'},
                   'N4' : {'N1','N2','N3','N4','N7','N8','N9','N10','N13','N14','N15','N16'},
                   'N5' : {'N5'},
                   'N6' : {'N6'},
                   'N7' : {'N1','N2','N3','N4','N7','N11'},
                   'N8' : {'N2','N3','N4','N8','N9','N10','N11'},
                   'N9' : {'N4','N8','N9','N10','N11','N13','N15','N16'},
                   'N10' : {'N4','N8','N9','N10','N11','N12','N13','N15','N16'},
                   'N11' : {'N1','N2','N3','N7','N8','N9','N10','N11','N13','N14','N15','N15'},
                   'N12' : {'N12'},
                   'N13' : {'N1','N2','N3','N4','N9','N10','N11','N13'},
                   'N14' : {'N4','N11','N14'},
                   'N15' : {'N2','N3','N4','N9','N10','N11','N15'},
                   'N16' : {'N2','N3','N4','N9','N10','N11','N16'}
        } 
        kdf,suspkdf = keywordimport()
        print("classification")
        fdf = get_classify_result(fdf,kdf,suspkdf,notification_corelation_dict)
        return fdf

In [19]:
%%time
fdf= read_pdf_n_insert('/home/thrymr/Racmo/Testfiles')

parsing
('rtf ----------->', u'juzgado de primera instancia numero 1 de cordoba\ncalle isla de mallorca, s/n. modulo b, 4\xaa planta.\ntlf.: 957 745044/43. fax: 957 002358\nemail: \nn.i.g.: 1402142c20080013827\nprocedimiento: ejecucion de titulos no judiciales 1435/2008. negociado: b\nsobre: reclamacion de cantidad \nde: caixanova y eos spain s.l.\nprocurador/a: sr/a. encarnacion villen perez y silvia malagon loyo\nletrado: sr/a. maria jose cosmea rodriguez\ncontra: actividades construccion y servicios andaluces, s.l.\nprocurador/a: sr/a. \nletrado: sr/a. francisco de asis moreno cordoba\n\ndiligencia de ordenacion del letrado/a de la administracion de justicia sr/a.d/d\xaa francisco jose aguilar osuna\n\nen cordoba, a siete de mayo de dos mil dieciocho.\n\n\nel anterior escrito presentado por la administracion concursarl de actividades construccion y servicios andaluces, s.l., unase a los autos de su razon y dese traslado a las partes personadas a los efectos procedentes.\n        \ns

  _warn_if_not_unicode(string)


CPU times: user 29.1 s, sys: 300 ms, total: 29.4 s
Wall time: 5min 44s
