# In [None]:
import configparser 
import datetime 
import os 
import pickle
import time 
from io import StringIO 
from collections import defaultdict
import json
import docx
import flask
import flask_restplus
import gensim.summarization
import matplotlib.pyplot as plt
import numpy as np
import spacy
import torch
from OpenSSL import SSL
from PIL import Image
from benfordslaw import benfordslaw
from bs4 import BeautifulSoup
from commonregex import CommonRegex
from flask import Flask, request, jsonify
from flask_cors import CORS
from flask_restplus import Resource, Api 
from gensim.summarization.summarizer import summarize
from langdetect import detect_langs, DetectorFactory 
from nltk.tokenize import sent_tokenize
from pandas import read_csv
from profanity_check import predict 
from pyod.models.knn import KNN
from rake_nltk import Rake 
from sklearn import svm 
from sklearn.ensemble import IsolationForest
from tika import parser, tika 
from transformers import T5Tokenizer, T5ForConditionalGeneration 
from werkzeug.datastructures import FileStorage
from wordcloud import WordCloud, STOPWORDS,ImageColorGenerator

# Expose the Swagger UI blueprint's static assets under /static.
flask_restplus.apidoc.apidoc.static_url_path = "/static"


@flask_restplus.apidoc.apidoc.add_app_template_global
def swagger_static(filename):
    """Return the URL of a Swagger UI asset for use inside the API doc templates.

    Args:
        filename: Asset name relative to the ``swagger-ui/dist`` directory.

    Returns:
        The URL produced by ``flask.url_for`` for the ``restplus_doc``
        blueprint's static endpoint.
    """
    # The endpoint is "<blueprint>.static"; the original text had a space in
    # place of the dot and "swagger-vi" in place of "swagger-ui".
    return flask.url_for("restplus_doc.static", filename="swagger-ui/dist/{0}".format(filename))

# Flask application and REST API object shared by every route in this module.
# Flask needs the module's import name, i.e. the dunder ``__name__``.
igad_application = Flask(__name__)
igad_application.config.SWAGGER_UI_DOC_EXPANSION = "list"
CORS(igad_application)
api = Api(igad_application, version="1.0", title="Document Analyzer", description="API")

def sentence_finder(text, word_list):
    """Return the sentences of *text* that mention any entry of *word_list*.

    Matching is a substring test against the lower-cased sentence, so the
    entries of *word_list* are expected to be lower case already. Characters
    with code points 0-31 are removed from each kept sentence, and the kept
    sentences are joined with single spaces.
    """
    # Translation table that deletes the 32 ASCII control characters.
    strip_controls = {code: None for code in range(32)}

    matches = []
    for sentence in sent_tokenize(text):
        lowered = sentence.lower()
        if any(term in lowered for term in word_list):
            matches.append(sentence.translate(strip_controls))
    return " ".join(matches)
    
# Arguments accepted by /api/v1/DocAnalyzer: the uploaded file, the task to
# run, and the task-specific 'question' / 'Topic' values.
upload_parser = api.parser()
upload_parser.add_argument('file', location='files', required=True, type=FileStorage, help='Upload the PDF file')
upload_parser.add_argument('Task', type='string', required=True, choices=["Summarization", "Topic", "Question"], help="Type of task. If Task is QnA, Quesion is required. If Task is Topic Extraction, Topic is required.")
upload_parser.add_argument('question', type='string', required=False, help='Question')
upload_parser.add_argument('Topic', type='string', required=False, choices=["PEP", "Terrorist Financing", "AML"], help="Topic for which info to be extracted")

@api.route('/api/v1/DocAnalyzer')
@api.response(500, 'An unexpected error occurred.')
class IGAD_Document_Auditor(Resource):
    """Endpoint that summarises, topic-filters or question-answers an uploaded document."""

    @api.expect(upload_parser)
    def post(self):
        """Run the task named by the ``Task`` query argument on the uploaded file.

        The upload is stored under ``<basePath>input/`` (``basePath`` comes
        from ``config.ini``), parsed with Tika and then handled by task:

        * ``"Summarization"`` -- gensim extractive summary; documents with
          fewer than 40 pages also get a T5 abstractive summary.
        * ``"Topic"`` -- sentences containing the keywords of the ``Topic``
          argument; any value other than ``"PEP"`` or
          ``"Terrorist Financing"`` uses the AML keyword list.
        * any other value -- T5 question answering with the ``question``
          argument; documents of 40 pages or more are rejected.

        Returns:
            A JSON response built from the task-specific result dictionary.

        Raises:
            werkzeug.exceptions.BadRequest: the question-answering branch was
                selected without a ``question`` argument.
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        basePath = config['DEFAULT']['basePath']
        #tika.TikaJarPath = basePath + 'tika'
        uploaded_file = request.files['file']
        question = request.args.get('question')
        print(' question::', (str(question)))

        task = request.args.get('Task')
        topic = request.args.get('Topic')

        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
        start = time.time()
        print('*******Execution started::', st)

        # basename() keeps client-supplied path components from escaping the
        # upload folder; splitext() also copes with names that have no dot
        # (the previous rfind-based slicing dropped their last character).
        original_name = os.path.basename(uploaded_file.filename or '')
        stem, extension = os.path.splitext(original_name)
        final_filename = stem.replace(" ", "") + "_" + st

        UPLOAD_FOLDER = basePath + 'input/'
        saved_path = os.path.join(UPLOAD_FOLDER, final_filename + extension)
        uploaded_file.save(saved_path)
        parsedPDF = parser.from_file(saved_path)

        # Drop non-ASCII characters, then decode back to text. Calling str()
        # on the bytes object would embed the b'...' repr and its escape
        # sequences in the text handed to the models.
        raw_content = parsedPDF["content"] or ""
        pdf_content = raw_content.encode('ascii', errors='ignore').decode('ascii')
        preprocess_text = pdf_content.strip().replace("\n", "")

        if task == "Summarization":
            if int(parsedPDF["metadata"]['xmpTPg:NPages']) < 40:
                tokenizer = T5Tokenizer.from_pretrained(basePath + 'models/t5_small')
                model = T5ForConditionalGeneration.from_pretrained(basePath + 'models/t5_small')
                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                # The inputs are moved to `device`, so the model must be too.
                model = model.to(device)

                t5_prepared_Text = "summarize: " + preprocess_text

                extractive_summary = summarize(preprocess_text, word_count=200)

                tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)
                summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=2, min_length=100, max_length=400, early_stopping=False)
                abstractive_summary_t5 = str(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

                # Capitalise each sentence; a trailing fragment shorter than
                # 40 characters is treated as a cut-off sentence and dropped.
                sentences = [part.strip().capitalize() for part in abstractive_summary_t5.split('.')]
                if len(sentences[-1]) < 40:
                    abstractive_summary = '. '.join(sentences[:-1]) + '.'
                else:
                    abstractive_summary = '. '.join(sentences)

                file_data_sent = {'Abstractive Summary': str(abstractive_summary),
                                  'Extractive Summary': str(extractive_summary)}
            else:
                # Large documents only get the extractive summary.
                extractive_summary = summarize(preprocess_text, word_count=300)
                file_data_sent = {'Extractive Summary': str(extractive_summary)}

        elif task == "Topic":
            topic_keywords = {
                "PEP": ['pep', 'politically exposed person', 'political'],
                "Terrorist Financing": ['terrorist financing', 'counter-terrorist financing', 'financing of terrorism', 'terrorism financing', 'terror funding'],
            }
            aml_keywords = ['aml', 'anti-money laundering', 'anti money laundering']
            word_list = topic_keywords.get(topic, aml_keywords)
            topic_info = sentence_finder(preprocess_text, word_list)

            file_data_sent = {'Topic Information': str(topic_info)}

        else:
            if int(parsedPDF["metadata"]['xmpTPg:NPages']) < 40:
                if not question:
                    # Without this guard the concatenation below raises TypeError.
                    flask.abort(400, 'question is required for the Question task')

                tokenizer = T5Tokenizer.from_pretrained(basePath + 'models/t5_small')
                model = T5ForConditionalGeneration.from_pretrained(basePath + 'models/t5_small')

                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
                model = model.to(device)

                # Separators keep the question from running into "context:".
                t5_prepared_Text = "question: " + question + " context: " + preprocess_text
                tokenized_text = tokenizer.encode(t5_prepared_Text, return_tensors="pt").to(device)
                summary_ids = model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=2, min_length=50, max_length=300, early_stopping=False)
                answer = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

                file_data_sent = {'Answer': str(answer)}
            else:
                file_data_sent = {'Answer': str('Document too big to process!!!')}

        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
        print('************** Execution completed::', st)
        print('************** Total Execution completed::', time.time() - start)

        return jsonify(file_data_sent)

# Arguments accepted by /api/v1/DocClassifier: a single required file upload.
doc_classifier_parser = api.parser()
doc_classifier_parser.add_argument('file', location='files', required=True, type=FileStorage, help='Upload the PDF file')

@api.route('/api/v1/DocClassifier')
@api.response(500,'An unexpected error occured.')
class IGAD_DocClassifier(Resource):
    """Endpoint that classifies an uploaded document and extracts descriptive data."""

    @staticmethod
    def _group_entities(spacy_doc, wanted_labels):
        """Group the distinct entity texts of *spacy_doc* by entity label.

        Only entities whose ``label_`` is in *wanted_labels* are kept. Each
        (text, label) pair is reported once, in first-seen order, so the
        result is stable between runs.

        Returns:
            A plain ``dict`` mapping label -> list of entity texts.
        """
        grouped = {}
        seen = set()
        for ent in spacy_doc.ents:
            pair = (ent.text, ent.label_)
            if ent.label_ in wanted_labels and pair not in seen:
                seen.add(pair)
                grouped.setdefault(ent.label_, []).append(ent.text)
        return grouped

    @api.expect(doc_classifier_parser)
    def post(self):
        """Analyse the uploaded file and return the findings as JSON.

        The upload is stored under ``<basePath>input/`` and parsed with Tika.
        The method then detects the language, extracts key phrases, keywords
        and named entities, classifies the text with the pickled classifier,
        builds a word cloud, writes a ``.docx`` report next to the upload,
        and returns the same information as a JSON response.
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        basePath = config['DEFAULT']['basePath']
        #tika.TikaJarPath = basePath + 'tika'

        uploaded_file = request.files['file']  # This is FileStorage instance
        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
        start = time.time()
        print('*************** Execution started::', st)

        # basename() keeps client-supplied path components from escaping the
        # upload folder; splitext() also copes with names that have no dot.
        original_name = os.path.basename(uploaded_file.filename or '')
        stem, extension = os.path.splitext(original_name)
        final_filename = stem.replace(" ", "_") + "_" + st
        UPLOAD_FOLDER = basePath + 'input/'
        saved_path = os.path.join(UPLOAD_FOLDER, final_filename + extension)
        uploaded_file.save(saved_path)

        parsedPDF = parser.from_file(saved_path)
        metadata = parsedPDF["metadata"]
        # Drop non-ASCII characters and decode back to text. str() on the
        # bytes object would embed the b'...' repr in the analysed text.
        pdf_content = (parsedPDF["content"] or "").encode('ascii', errors='ignore').decode('ascii')

        # Second pass with XHTML output so the text can be split per page.
        pages_txt = []
        PDF_pages = parser.from_file(saved_path, xmlContent=True)
        # NOTE(review): no parser is named, so bs4 picks one and warns;
        # consider passing an explicit parser name.
        xhtml_data = BeautifulSoup(PDF_pages['content'])
        for content in xhtml_data.find_all('div', attrs={'class': 'page'}):
            parsed_content = parser.from_buffer(str(content))
            text = (parsed_content['content'] or '').strip()
            pages_txt.append(text.encode('ascii', errors='ignore').decode('ascii').strip().replace("\n", ""))

        preprocess_text = pdf_content.strip().replace("\n", "")
        DetectorFactory.seed = 2
        lang_detected = str(detect_langs(preprocess_text))
        r = Rake()
        r.extract_keywords_from_text(preprocess_text)
        keyphrases = r.get_ranked_phrases()[0:5]
        keywords = str(gensim.summarization.keywords(preprocess_text, words=20, lemmatize=True).split('\n'))
        nlp = spacy.load("en_core_web_sm")
        nlp.max_length = 4000000
        spacy_doc = nlp(preprocess_text)

        ner_Output = self._group_entities(
            spacy_doc, ['PERSON', 'ORG', 'GPE', 'EVENT', 'LAW', 'MONEY'])

        # "Important" entities come from the first page plus the file name.
        # Guard against documents for which no page divs were found.
        first_page = pages_txt[0] if pages_txt else ''
        imp_words = first_page + ' ' + stem
        imp_spacy_doc = nlp(imp_words)
        imp_ner_Output = self._group_entities(
            imp_spacy_doc,
            ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'DATE', 'MONEY'])

        # NOTE(review): pickle executes code on load; this path must only
        # ever point at a trusted, server-side model file.
        with open(basePath + 'docs/pyss3/pyss3_v2.pkl', 'rb') as model_file:
            clf = pickle.load(model_file)
        class_pred = clf.classify_label(preprocess_text)
        c_pred = clf.classify(preprocess_text, sort=True)

        # Replace the numeric class id at the head of each prediction tuple
        # with its display name; unknown ids are left untouched.
        class_names = {0: 'ABC', 1: 'AML', 2: 'EMBARGO', 3: 'GDPR', 4: 'KYC', 5: 'unknown'}
        for index, item in enumerate(c_pred):
            itemlist = list(item)
            itemlist[0] = class_names.get(itemlist[0], itemlist[0])
            c_pred[index] = tuple(itemlist)

        stopwords = set(STOPWORDS)
        stopwords.update(["should", "now", "will", "include", "may be", "ha", "Where"])
        # The response reports wordcloud_sg.words_, so the cloud has to be
        # generated here; the original never created it (NameError).
        wordcloud_sg = WordCloud(stopwords=stopwords).generate(preprocess_text)

        mydoc = docx.Document()
        mydoc.add_heading('IGAD Document Analyzer', 0)

        mydoc.add_paragraph('Analysis for document ::' + str(uploaded_file.filename))
        mydoc.add_paragraph('File character length ::' + str(len(preprocess_text)))
        mydoc.add_paragraph('Number of pages ::' + str(parsedPDF["metadata"]['xmpTPg:NPages']))
        if 'Creation-Date' in parsedPDF["metadata"]:
            mydoc.add_paragraph('Creation Date::' + str(parsedPDF["metadata"]['Creation-Date']))
        if 'CreationDate' in parsedPDF["metadata"]:
            mydoc.add_paragraph('Creation Date::' + str(parsedPDF["metadata"]["CreationDate"]))
        if 'Author' in parsedPDF["metadata"]:
            mydoc.add_paragraph('Author ::' + str(parsedPDF["metadata"]['Author']))
        mydoc.add_paragraph('Language detected ::' + str(lang_detected))
        mydoc.add_paragraph('Document Class detected::' + str(class_pred))
        mydoc.add_paragraph('Document Class Confidence Values ::' + str(c_pred))

        mydoc.add_heading('File metadata::', level=2)
        mydoc.add_paragraph(str(metadata))

        mydoc.add_heading('Key Phrases ::', level=2)
        mydoc.add_paragraph(str(keyphrases))

        mydoc.add_heading('Keywords::', level=2)
        mydoc.add_paragraph(str(keywords))

        mydoc.add_heading('Important Extracted Entities ::', level=2)
        mydoc.add_paragraph(str(imp_ner_Output))

        mydoc.add_heading('All Extracted Entities ::', level=2)
        mydoc.add_paragraph(str(ner_Output))

        file_data_sent = {'Input file': str(uploaded_file.filename), 'Metadata': str(metadata),'Document Class': str(class_pred), 'Document Class Confidence Values': str(c_pred),'Word Cloud': str(wordcloud_sg.words_), 'Document length': len(preprocess_text),'Language detected': str(lang_detected),'Number of pages': str(parsedPDF["metadata"]['xmpTPg:NPages']),'Keywords': str(keywords), 'Important Extracted Entities': str(imp_ner_Output),'All Named Entities': str(ner_Output), 'Key Phrases': str(keyphrases)}
        mydoc.save(os.path.join(UPLOAD_FOLDER, final_filename + ".docx"))

        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
        print('************Execution completed ::', st)
        print('*************Total execution time::', time.time() - start)

        return jsonify(file_data_sent)

# Arguments accepted by /api/v1/Compliance: one required 'Message' string.
compliance_parser = api.parser()
compliance_parser.add_argument(
    'Message',
    help='Enter data to be analysed',
    required=True,
    type='string',
)

@api.route('/api/v1/Compliance')
@api.response(500,'An unexpected error occured, Please contact administrator.')
class IGAD_Compliance(Resource):
    """Endpoint that scans a text message for PII, sensitive data and profanity."""

    @api.expect(compliance_parser)
    def post(self):
        """Analyse the ``Message`` query argument.

        Regex matches (e-mail, IP, street address, phone, credit card) come
        from ``CommonRegex``; person, organisation, date, nationality,
        location and money mentions come from spaCy named entities.

        Returns:
            A JSON response with three keys. ``'PII'`` and ``'Sensitive'``
            each hold a JSON-encoded mapping of category -> findings, or
            ``'NO'`` when nothing was found. ``'Objectionable'`` holds
            ``'NO'`` or an alert string.

        Raises:
            werkzeug.exceptions.BadRequest: ``Message`` is missing or empty.
        """
        message = request.args.get('Message')
        if not message:
            # Nothing to analyse; fail with a client error instead of a
            # crash further down.
            flask.abort(400, 'Message is required')

        parsed_text = CommonRegex(message)
        pii_detected = defaultdict(list)
        if len(parsed_text.emails) > 0:
            pii_detected['Email'].append(str(parsed_text.emails))
        # Report IPv4 and IPv6 hits together. The original combined the two
        # length checks with bitwise `|` (wrong precedence) and only ever
        # emitted the IPv4 list.
        ip_addresses = list(parsed_text.ips) + list(parsed_text.ipv6s)
        if ip_addresses:
            pii_detected['IP'].append(str(ip_addresses))
        if len(parsed_text.street_addresses) > 0:
            pii_detected['Address'].append(str(parsed_text.street_addresses))
        if len(parsed_text.phones) > 0:
            pii_detected['Phones'].append(str(parsed_text.phones))

        nlp = spacy.load("en_core_web_sm")
        nlp.max_length = 4000000
        spacy_doc = nlp(message)

        # Collect entity texts per label of interest. `label_` is the string
        # name; the original compared the `label` attribute to 'DATE', so
        # dates were never collected.
        entities = {label: [] for label in ('PERSON', 'ORG', 'DATE', 'MONEY', 'NORP', 'GPE')}
        for ent in spacy_doc.ents:
            if ent.label_ in entities:
                entities[ent.label_].append(ent.text)
        ner_money = entities['MONEY']

        pii_categories = (
            ('Person', 'PERSON'),
            ('Organisation', 'ORG'),
            ('Date', 'DATE'),
            ('Nationality', 'NORP'),
            ('Location', 'GPE'),
        )
        for category, label in pii_categories:
            if entities[label]:
                pii_detected[category].append(str(set(entities[label])))

        if not pii_detected:
            pii_detected = 'NO'
        else:
            pii_detected = json.dumps(pii_detected)

        sensitive_info = defaultdict(list)
        if len(parsed_text.credit_cards) > 0:
            sensitive_info['Credit_Card'].append(str(parsed_text.credit_cards))
        if ner_money:
            sensitive_info['Money'].append(str(ner_money))

        if not sensitive_info:
            sensitive_info = 'NO'
        else:
            sensitive_info = json.dumps(sensitive_info)

        if predict([message])[0] == 0:
            Offensive = 'NO'
        else:
            Offensive = 'Alert!!! Objectionable language detected.\n'

        file_data_sent = {'PII':pii_detected, 'Objectionable': Offensive, 'Sensitive': sensitive_info}

        return jsonify(file_data_sent)
                            
                            
# Arguments accepted by /api/v1/Anomaly_detection: one required file upload.
anomaly_parser = api.parser()
anomaly_parser.add_argument(
    'file',
    type=FileStorage,
    location='files',
    required=True,
    help='Upload the file:',
)

@api.route('/api/v1/Anomaly_detection')
@api.response(500, 'An unexpected error occured, Please contact administrator.')
class SG_AI_API_UpLoad(Resource):
    """Endpoint that runs outlier detection and a Benford's-law check on an uploaded CSV."""

    @api.expect(anomaly_parser)
    def post(self):
        """Detect anomalies in the uploaded header-less CSV file.

        The file is stored under ``<basePath>input/`` and loaded with
        ``read_csv(header=None)``. A row is reported as an outlier only when
        IsolationForest (prediction ``-1``), KNN (prediction ``1``) and
        OneClassSVM (prediction ``-1``) all flag it. Each column is also
        fitted with ``benfordslaw``; columns whose ``'P'`` result is at most
        0.05 are listed by 1-based column number.

        Returns:
            A JSON response with the keys ``'BenfordLaw'`` (message string)
            and ``'Outliers'`` (the outlier rows as a JSON string).
        """
        config = configparser.ConfigParser()
        config.read('config.ini')
        basePath = config['DEFAULT']['basePath']
        uploaded_file = request.files['file']  # This is Filestorage instance

        # '%M' / '%S' are the minute / second directives; the original text
        # had '&M-&S', which strftime would emit literally.
        st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')

        # basename() keeps client-supplied path components from escaping the
        # upload folder; splitext() also copes with names that have no dot.
        original_name = os.path.basename(uploaded_file.filename or '')
        stem, extension = os.path.splitext(original_name)
        UPLOAD_FOLDER = basePath + 'input/'

        final_filename = stem.replace(" ", "_") + "_" + st + extension
        saved_path = os.path.join(UPLOAD_FOLDER, final_filename)
        uploaded_file.save(saved_path)

        # Read back from exactly the path that was written.
        df = read_csv(saved_path, header=None)

        iso = IsolationForest(contamination=0.1)
        iso.fit(df)
        iso_pred = iso.predict(df)
        knn = KNN()
        knn.fit(df)
        knn_pred = knn.predict(df)
        oc_svm = svm.OneClassSVM()
        oc_svm.fit(df)
        oc_pred = oc_svm.predict(df)
        outliers = df[(iso_pred == -1) & (knn_pred == 1) & (oc_pred == -1)]

        bl = benfordslaw(alpha=0.05)
        benford_array = []
        for i in range(df.shape[1]):
            results = bl.fit(df[[i]].values)
            if results.get('P') <= 0.05:
                benford_array.append(i + 1)

        benford_law = 'Anomaly detected in following column numbers as per Benford law:' + str(benford_array)

        data_sent = {'BenfordLaw': benford_law, 'Outliers': outliers.to_json()}

        return jsonify(data_sent)
if __name__ == '__main__':
    # NOTE(review): debug=True while listening on every interface exposes the
    # Werkzeug debugger to the network; turn it off outside development.
    igad_application.run(host='0.0.0.0', port=5050, debug=True, threaded=True, use_reloader=False)