In [None]:
# Shared settings for the ingestion cells
counter = 0  # running document number; incremented and printed by doc_insert
vm_ip = "127.0.0.1:8000"
headers = {'Content-Type': 'application/json'}

# Base fields sent with every ingestion request.
# NOTE(review): credentials are hardcoded here - consider reading them from the environment.
payload = dict(host="localhost", user="root",
               password="password", database="Urbanwood")

# Endpoint that doc_insert posts to
srvc_url = "http://{0}/ingestion/echo/".format(vm_ip)

In [None]:
# Interacts with data ingestion service
import json
import requests

def doc_insert(document_name, document_text, category):
    '''
    Send one document to the data ingestion service and print its reply.

    Increments the module-level ``counter``, then POSTs the fields of the
    module-level ``payload`` together with the document fields as a JSON
    body to ``srvc_url`` using ``headers``.

    Args:
        document_name: value sent under the "document_name" key.
        document_text: value sent under the "document_text" key.
        category: value sent under the "category" key.

    Returns:
        None. The response body is printed, prefixed with the running count.
    '''
    global counter
    counter += 1

    # Build the request body from a copy so the shared ``payload`` constant is
    # not left holding the last document's fields after the call.
    body = dict(payload)
    body["document_name"] = document_name
    body["document_text"] = document_text
    body["category"] = category

    response = requests.request("POST", srvc_url, headers=headers, data=json.dumps(body))
    print('Document / Link - {0}: {1}'.format(counter, response.text.encode('utf8')))

In [None]:
# To preprocess text of documents and blogs
import re
def preprocess_doc(document):
    '''
    Clean raw document text before it is sent for ingestion.

    Steps, in order: drop non-ASCII characters, remove web links, collapse
    every run of whitespace (spaces, tabs, newlines) into a single space,
    and backslash-escape backslashes and double quotes.

    Args:
        document: text to clean (str).

    Returns:
        The cleaned text as a str.
    '''
    # keep ASCII characters only; newlines are kept here so they can still
    # act as word separators in the whitespace step below
    document = ''.join(ch for ch in document if ord(ch) < 128)

    # remove web links first, so the gaps they leave are collapsed next
    document = re.sub(r'http\S+', '', document)

    # turn newlines/tabs into spaces and squeeze repeated whitespace
    document = re.sub(r'\s+', ' ', document)

    # escape (not remove) backslashes and double quotes; backslashes go first
    # so the ones added in front of quotes are not doubled
    document = document.replace('\\', '\\\\').replace('"', '\\"')

    return document

In [None]:
# To parse and read pdf files
import PyPDF2
from tika import parser

def read_pdf(file_path, pages='all', lib='pypdf'):
    '''
    Extract the text of a PDF file and return it.

    Args:
        file_path: path of the PDF file to read.
        pages: 'all' to concatenate the text of every page in order, or
            'one' to return only the first page. Only used with lib='pypdf'.
        lib: extraction backend - 'pypdf' (PyPDF2) or 'tika' (tika parser).

    Returns:
        The extracted text. With lib='tika' this is whatever the parser
        stored under the 'content' key.
        NOTE(review): presumably that can be None when nothing is extracted;
        callers should check for a falsy result.

    Raises:
        ValueError: if pages or lib is not one of the supported choices.
    '''
    if lib == 'pypdf':
        # validate before touching the file so a bad argument fails fast
        if pages not in ('all', 'one'):
            raise ValueError('Wrong no of page choice')

        # the context manager closes the file even if extraction fails
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfFileReader(pdf_file)

            # try the empty password on encrypted files
            if pdf_reader.isEncrypted:
                pdf_reader.decrypt('')

            # page indices are zero-based: 'one' means the first page only
            if pages == 'all':
                page_numbers = range(pdf_reader.numPages)
            else:
                page_numbers = [0]

            return ''.join(pdf_reader.getPage(page_no).extractText()
                           for page_no in page_numbers)

    if lib == 'tika':
        file_data = parser.from_file(file_path)
        return file_data['content']

    raise ValueError('Wrong choice of pdf extraction library.')

In [None]:
# Recursively reading all files through directories 
import os
from nltk.tokenize import sent_tokenize

counter = 0  # restart the numbering that doc_insert prints
base_dir = 'data/Urban Wood'
for root, subdirs, files in os.walk(base_dir):
    # category = folder path below base_dir, '/'-separated
    # ('' for files sitting directly in base_dir)
    rel_dir = os.path.relpath(root, base_dir)
    category = '' if rel_dir == os.curdir else rel_dir.replace(os.sep, '/')

    for file in files:
        # only real PDF files: match the extension, not any name containing '.pdf'
        if not file.lower().endswith('.pdf'):
            continue

        file_path = os.path.join(root, file)
        try:
            document_text = read_pdf(file_path, lib='tika')
        except Exception as e:
            print ('Error while reading: ', file_path)
            print ('Error: ', e)
            # skip this file; otherwise the previous file's text would be
            # inserted again under this file's name
            continue

        # preprocess document
        if document_text:
            document_text = preprocess_doc(document_text)

            # Storing each document in DB one by one
            doc_insert(file, document_text, category)
        else:
            print ('Document was not read: ', file_path)

            # # Writing parsed documents to txt files for verification
            # filename = 'data/read_files/' + str(file_path.replace('/','+')) + '.txt'
            # file = open(filename, "w+")
            # file.write(document_text)
            # file.close()

In [None]:
# Inserting scraped data into sql
import json
import pandas as pd

# Load the crawler output; each record is read below through its 'link' and 'text' keys
with open('data/crawl_data.json') as json_file:
    crawl_data = json.load(json_file)

counter = 0
workbook = pd.read_excel('data/Websites.xlsx', sheet_name=None)
dfs = workbook['Sheet1']
for category, url in zip(dfs['Category'], dfs['URL']):
    # send every crawled record whose link equals this sheet row's URL
    for page in crawl_data:
        if url != page['link']:
            continue

        # clean the text, then insert it under "<url>_<counter>"
        cleaned_text = preprocess_doc(page['text'])
        doc_insert(url + '_' + str(counter), cleaned_text, category)
            

In [None]:
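# Deprecated code; data ingestion ends here.
# NOTE: `document_df`, which the classification cells below read, is only built by this
# commented-out block (and it needs the commented-out `mycursor` DB connection at the end of the notebook).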
# # Retrieves all saved docs, to verify if docs stored or not
# import pandas as pd

# # get and display training data
# query = "SELECT * FROM training_data"

# ## getting records from the table
# mycursor.execute(query)

# ## fetching all records from the 'cursor' object
# records = mycursor.fetchall()
# field_names = [i[0] for i in mycursor.description]
# print (field_names)

# # Storing in dataframe
# name = list()
# text = list()
# category = list()
# ## Showing the data (1: name, 2: text, 3: category)
# for index, record in enumerate(records):
#     name.append(record[0])
#     text.append(record[1])
#     category.append(record[2])
    
# document_df = pd.DataFrame({'name':name, 'text':text, 'category':category})
# print (document_df.shape)
# document_df.head()

In [None]:
# Data ingestion has ended 
# Analysis (Classification using deepnet)
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pickle
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path

# NOTE(review): document_df is not created by any active cell in view; only the
# commented-out retrieval cell builds it.
categories = document_df['category']
texts = document_df['text']

# Settings
vocab_size = 15000
batch_size = 100
num_labels = len(set(categories))

# Features: tf-idf matrix from a tokenizer capped at vocab_size words
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)
doc_data = tokenizer.texts_to_matrix(texts, mode='tfidf')

# Targets: binarized category labels
encoder = LabelBinarizer()
doc_target = encoder.fit_transform(categories)

In [None]:
# Preparing and training deepnet model
# Two hidden blocks (Dense 512 -> relu -> dropout 0.3) followed by a softmax
# output with one unit per label
model = Sequential([
    Dense(512, input_shape=(vocab_size,)),
    Activation('relu'),
    Dropout(0.3),
    Dense(512),
    Activation('relu'),
    Dropout(0.3),
    Dense(num_labels),
    Activation('softmax'),
])
model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Train on the tf-idf matrix, holding out 20% of the rows for validation
fit_options = dict(batch_size=batch_size,
                   epochs=30,
                   verbose=1,
                   validation_split=0.2)
history = model.fit(doc_data, doc_target, **fit_options)

In [None]:
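# Deprecated code: direct MySQL access, kept for reference only.
# SECURITY: the commented-out connection below embeds database credentials; rotate them
# and load them from the environment instead of keeping them in the notebook.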
# # DB connection
# import mysql.connector

# mydb = mysql.connector.connect(
#   host="mysql-prod3.oit.umn.edu",
#   user="cfansoaespinourbanwood",
#   passwd="2sMh$x7gyx",
#   database="cfans_oaespino_urbanwood"
# )

# mycursor = mydb.cursor()

# def doc_insert(document_name, document_text, category):
#     # Uses globally created db connections
#     # Inserting record
#     query = "INSERT INTO training_data (document_name, document_text, category) VALUES (%s, %s, %s)"

#     ## storing values in a variable
#     values = (document_name, document_text, category)

#     ## executing the query with values
#     mycursor.execute(query, values)

#     ## to make final output we have to run the 'commit()' method of the database object
#     mydb.commit()