This file is used by: 
- extract_sentences_from_pdf.ipynb


It contains the following Methods
- markdown_to_text()
- remove_nums_from_str(s)
- pre_process_sentence(sent)
- pre_process(textBlob) returns the text tokenized into sentences
- remove_excluded_files(file_list)
- remove_excluded_files_except(file_list, except_with_text)
- rreplace(s, old, new, occurrence)
- clean_file_name(name, replacements2=[]):
- def save_to_github(git_user, git_password, git_repo, my_file_list, push_to_git_as):
- def list_files_from_github_dir (owner, repo, dir_ref):
- keep_pdf_urls_only(file_list):
- def concat_files_from_github_dir (directory_base_url, file_list):  
- def read_single_file_from_github_dir (directory_base_url, file_name): 


It can be incorporated into any other notebook by using 
    -%run ./AIVM_helper_classes.ipynb

In [3]:
# Required Python utilities
import numpy as np
import pandas as pd

from collections import Counter
import re
from langdetect import detect
from bs4 import BeautifulSoup
from markdown import markdown
from lxml import etree
import os
import random
import tqdm
import itertools 
import pickle
import time
from PIL import Image
from datetime import datetime

## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

## ML required imports (for clustering)
from sklearn import metrics
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import scale, StandardScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN

# Topic modeling imports
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## NLP related imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer


# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set() 

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import base64

from github import Github, GithubException, InputGitTreeElement
from IPython.display import display, clear_output, HTML, Image

import requests as req

Using Theano backend.


In [56]:
class AIVM_Generic_helper:
    """Text-cleaning, file-name and GitHub helpers shared by the AIVM notebooks.

    Methods that read no instance state are static methods, so they can be
    called either on the class or on an instance.
    """

    def __init__(self):
        print("initialised")

    def image_to_byte_array(self, image: "Image"):
        """Serialise an image object to raw bytes in its own format.

        The annotation is quoted on purpose: it is not evaluated at definition
        time, so it does not depend on which ``Image`` name is in scope.

        Args:
            image: object exposing ``save(fp, format=...)`` and ``format``
                (presumably a PIL image -- confirm against callers).

        Returns:
            The bytes written by ``image.save``.
        """
        buffer = io.BytesIO()
        image.save(buffer, format=image.format)
        return buffer.getvalue()

    def time_stamp(self):
        """Return the current local date and time as two strings.

        Returns:
            ``(date, time)``, e.g. ``('2019-10-09', '14:37:24.067756')``.
        """
        now = datetime.now()
        return str(now.date()), str(now.time())

    def markdown_to_text(self, markdown_string):
        """Convert a markdown string to plain text, dropping code snippets."""
        # md -> html -> text since BeautifulSoup can extract text cleanly
        html = markdown(markdown_string)

        # Remove code snippets. DOTALL lets a block that spans several lines
        # match; the closing tag is "</code>" (no space before ">").
        html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
        html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

        # extract text
        soup = BeautifulSoup(html, "html.parser")
        return ''.join(soup.findAll(text=True))

    def remove_nums_from_str(self, s):
        """Return ``s`` with every digit character removed."""
        return ''.join(ch for ch in s if not ch.isdigit())

    def pre_process_sentence(self, sent):
        """Normalise a single sentence.

        Dashes, semicolons and commas become spaces, digits are removed and
        a space before a full stop is dropped.

        Args:
            sent: the sentence to clean.

        Returns:
            The cleaned sentence.
        """
        sent = sent.replace('-', ' ')
        sent = re.sub(' +', ' ', sent)
        sent = sent.replace(";", ", ")
        sent = re.sub(' +', ' ', sent)
        sent = sent.strip()
        # Must go through self: the helper is a method, not a module function.
        sent = self.remove_nums_from_str(sent.replace(",", " "))
        sent = sent.replace("  ", " ")
        sent = sent.replace(" .", ".")
        return sent

    def pre_process(self, text):
        """Clean a raw text blob and split it into lower-cased sentences.

        Args:
            text: raw text; converted with ``str()`` before processing.

        Returns:
            The result of ``sent_tokenize`` on the cleaned text.
        """
        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', str(text))

        # Removing explicit (escaped) line changes
        document = re.sub(r'\\n', '', document)

        # Remove HTML code from text. The parser is named explicitly; "lxml"
        # is what BeautifulSoup picks by default when lxml is installed.
        document = BeautifulSoup(document, "lxml").get_text()

        # Parse text from markdown code
        document = self.markdown_to_text(document)

        # Removing lines that start with a URL
        document = re.sub(
            r'^https?:\/\/.*[\r\n]*', '', document, flags=re.MULTILINE)

        # Removing strings such as \\xe5 \\xe6 \\xe7 that appear a lot in the descriptions
        document = re.sub(r':?\\+x\w{2}', ' ', document)

        # Remove all the special characters except spaces, dashes, commas and dots
        # (this also removes every ';', so no separate ';' handling is needed)
        document = re.sub(r"[^\s.,\-a-zA-Z0-9]", ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document)

        # Dropping runs of two or more '-' entirely
        document = re.sub(r'\-{2,50}', '', document)

        # Remaining single dashes become spaces, then re-collapse spaces
        document = document.replace('-', ' ')
        document = re.sub(' +', ' ', document)

        # Converting to Lowercase
        document = document.lower()

        # Sentences Tokenization
        return sent_tokenize(document)

    @staticmethod
    def remove_excluded_files(file_list):
        """Keep only the "gathered_" files that are not hidden, private or random.

        A name is kept when it contains ``"gathered_"``, does not contain
        ``"random"`` and does not start with ``"."`` or ``"_"``.
        """
        return [
            f for f in file_list
            if "gathered_" in f
            and "random" not in f
            and not f.startswith(('.', '_'))
        ]

    @staticmethod
    def keep_pdf_urls_only(file_list):
        """Keep only entries that end in ``.pdf`` and contain no space."""
        return [f for f in file_list if f.endswith('.pdf') and " " not in f]

    @staticmethod
    def remove_excluded_files_except(file_list, except_with_text):
        """Like ``remove_excluded_files`` but with a caller-supplied marker.

        A name is kept when it contains ``except_with_text``, does not contain
        ``"random"`` and does not start with ``"."`` or ``"_"``.
        """
        return [
            f for f in file_list
            if except_with_text in f
            and "random" not in f
            and not f.startswith(('.', '_'))
        ]

    @staticmethod
    def rreplace(s, old, new, occurrence):
        """Replace the last ``occurrence`` occurrences of ``old`` in ``s`` with ``new``."""
        return new.join(s.rsplit(old, occurrence))

    @staticmethod
    def clean_file_name(name, replacements2=None):
        """Strip ``.txt`` / ``.csv`` / ``.tsv`` and any extra substrings from ``name``.

        Args:
            name: the file name to clean.
            replacements2: optional iterable of additional substrings to remove.

        Returns:
            ``name`` with every listed substring removed (all occurrences).
        """
        replacements = [".txt", ".csv", ".tsv"]
        # None instead of a shared mutable default list.
        for r in itertools.chain(replacements, replacements2 or ()):
            name = name.replace(r, "")
        return name

    @staticmethod
    def save_to_github(git_user, git_password, git_repo, my_file_list, push_to_git_as):
        '''
        Commit local files to the master branch of a GitHub repository.

        In order to push a file to github it must first be stored locally, then
        pushed; this local location can also be local to a virtual machine.

        takes:
                git username, password, repo,
                a list of files to push to git ie the full local location of file,
                a matching list of paths to push each file to in Git hub

        returns:
                (commit, 'ok') on success, otherwise ("error", message)
        '''
        message = 'ok'

        # One target path is needed per local file; fail before touching GitHub.
        if len(push_to_git_as) < len(my_file_list):
            return "error", "push_to_git_as must contain one target path per file in my_file_list"

        try:
            g = Github(git_user, git_password)
            try:
                repo = g.get_user().get_repo(git_repo)
            except (IOError, OSError, GithubException) as e:
                # Python 3 exceptions have no .message attribute.
                return "error", str(e)

            commit_message = 'training data updated via the audit tool'

            master_ref = repo.get_git_ref('heads/master')
            master_sha = master_ref.object.sha
            base_tree = repo.get_git_tree(master_sha)
            element_list = []

            for entry, remote_path in zip(my_file_list, push_to_git_as):
                # Read as bytes (works for zip and other binary files) and make
                # sure the handle is closed.
                with open(entry, "rb") as input_file:
                    data = base64.b64encode(input_file.read())

                # PNGs get a second base64 layer, so after GitHub decodes the
                # blob the stored file is base64 text rather than binary.
                # NOTE(review): presumably readers decode it again -- confirm.
                if entry.endswith('.png'):
                    data = base64.b64encode(data)

                blob = repo.create_git_blob(data.decode("utf-8"), "base64")
                # Each InputGitTreeElement corresponds to one file.
                element_list.append(InputGitTreeElement(
                    path=remote_path, mode='100644', type='blob', sha=blob.sha))

            tree = repo.create_git_tree(element_list, base_tree)
            parent = repo.get_git_commit(master_sha)
            commit = repo.create_git_commit(commit_message, tree, [parent])
            master_ref.edit(commit.sha)
            return commit, message
        except Exception:
            # Best-effort boundary: any failure is reported to the caller as an
            # ("error", message) pair instead of raising.
            message = "GitHub save FAILED:" + '\n' + "Are your github login credentials correct?" + \
                '\n' + "Are you a collaberator in the repo?"
            return "error", message

    @staticmethod
    def list_files_from_github_dir(owner, repo, dir_ref):
        """List the paths stored under one git tree of a GitHub repository.

        To obtain the id for the folder, navigate the tree using
        https://api.github.com/repos/{owner}/{repo}/git/trees/master
        e.g. https://api.github.com/repos/aideenf/AIVC/git/trees/master
        Each directory is then reachable as
        https://api.github.com/repos/{owner}/{repo}/git/trees/{dir_ref}
        e.g. dir_ref = "048349b4dd81d95a17129e7fcd5418bdca8309b3"

        Args:
            owner: repository owner.
            repo: repository name.
            dir_ref: sha of the tree to list.

        Returns:
            The ``path`` of every entry in the response's ``tree`` list.

        Raises:
            requests.HTTPError: if the API answers with a 4xx/5xx status.
        """
        # Headers sent to avoid getting a cached response.
        # https://github.com/octokit/rest.js/issues/890 #need to add this for caching
        headers = {
            'Cache-Control': 'no-cache',
            'Pragma': 'no-cache',
            'If-Modified-Since': 'Thu, 14 Sep 2019 22:31:14 GMT',
            'If-None-Match': '048349b4dd81d95a17129e7fcd5418bdca8309b3',
        }

        dir_to_search = "https://api.github.com/repos/" + \
            owner + "/" + repo + "/git/trees/" + dir_ref
        print(dir_to_search)
        resp = req.get(dir_to_search, headers=headers)
        resp.raise_for_status()
        # NOTE: "truncated": false  we should check for truncated = true to do follow on call to get all files
        return [entry['path'] for entry in resp.json()['tree']]

    @staticmethod
    def _read_tsv(url):
        """Read one tab-separated file, skipping malformed lines."""
        try:
            return pd.read_csv(url, sep='\t', on_bad_lines='skip')
        except TypeError:
            # Older pandas does not know on_bad_lines; use the former spelling.
            return pd.read_csv(url, sep='\t', error_bad_lines=False)

    @staticmethod
    def concat_files_from_github_dir(directory_base_url, file_list):
        """Read every file in ``file_list`` as TSV and concatenate the frames.

        Args:
            directory_base_url: prefix prepended to each file name.
            file_list: names of the tab-separated files to read.

        Returns:
            One DataFrame with the rows of all files; an empty DataFrame when
            ``file_list`` is empty.
        """
        df_list = [
            AIVM_Generic_helper._read_tsv(directory_base_url + file_name)
            for file_name in file_list
        ]
        if not df_list:
            # pd.concat raises on an empty list.
            return pd.DataFrame()
        return pd.concat(df_list)

    @staticmethod
    def read_single_file_from_github_dir(directory_base_url, file_name):
        """Read one tab-separated file into a DataFrame.

        Args:
            directory_base_url: prefix prepended to ``file_name``.
            file_name: name of the tab-separated file to read.

        Returns:
            The DataFrame read from ``directory_base_url + file_name``.
        """
        return AIVM_Generic_helper._read_tsv(directory_base_url + file_name)

In [57]:
# Smoke test: build the helper and print the (date, time) pair it reports.
aivm = AIVM_Generic_helper()
print(aivm.time_stamp())

initialised
('2019-10-09', '14:37:24.067756')


In [None]:
#### I think we can delete this version; I need to check whether it is used anywhere.
def save_to_github_not_zip(git_user, git_password, git_repo, my_file_list, push_to_git_as):
    '''
    Commit local text files (and PNGs as base64 text) to the master branch.

    Unlike the blob-based variant, file content is placed directly into the
    tree elements, so it must be text.

    takes:
            git username, password, repo,
            a list of files to push to git ie the full local location of file,
            a matching list of paths to push each file to in Git hub

    returns:
            (commit, "ok") on success, otherwise ("error", message)
    '''
    message = "ok"

    # One target path is needed per local file; fail before touching GitHub.
    if len(push_to_git_as) < len(my_file_list):
        return "error", "push_to_git_as must contain one target path per file in my_file_list"

    try:
        g = Github(git_user, git_password)

        try:
            repo = g.get_user().get_repo(git_repo)
        except (IOError, OSError, GithubException) as e:
            # Python 3 exceptions have no .message attribute.
            return "error", str(e)

        commit_message = 'training data audited via the audit tool'

        master_ref = repo.get_git_ref('heads/master')
        master_sha = master_ref.object.sha
        base_tree = repo.get_git_tree(master_sha)
        element_list = []

        for entry, remote_path in zip(my_file_list, push_to_git_as):
            if entry.endswith('.png'):
                # PNGs are binary: read bytes and store them as base64 text.
                # b64encode needs bytes and returns bytes, hence the decode.
                with open(entry, "rb") as input_file:
                    data = base64.b64encode(input_file.read()).decode("ascii")
            else:
                with open(entry) as input_file:
                    data = input_file.read()

            element_list.append(
                InputGitTreeElement(remote_path, '100644', 'blob', data))

        tree = repo.create_git_tree(element_list, base_tree)
        parent = repo.get_git_commit(master_sha)
        commit = repo.create_git_commit(commit_message, tree, [parent])
        master_ref.edit(commit.sha)
        return commit, message
    except Exception:
        # Best-effort boundary: any failure is reported to the caller as an
        # ("error", message) pair instead of raising.
        message = "GitHub save FAILED:" + '\n' +"Are your github login credentials correct?" + '\n' + "Are you a collaberator in the repo?"
        return "error", message

In [55]:
# List the entries of one git tree of the aideenf/AIVC repository.
# NOTE(review): list_files_from_github_dir is not defined at module level in
# the cells shown here -- confirm where this name comes from before running.
my_file_list = list_files_from_github_dir(
    "aideenf", "AIVC", "602c472723d27ff6a14e73c3e5e5da42087b73d8")
display(my_file_list)

https://api.github.com/repos/aideenf/AIVC/git/trees/602c472723d27ff6a14e73c3e5e5da42087b73d8


['audited_training_data_aideenf_2019-10-0520:03:25.312689.tsv',
 'audited_training_data_aideenf_2019-10-0520:33:33.129616.tsv',
 'audited_training_data_aideenf_2019-10-0520:49:55.413888.tsv',
 'audited_training_data_aideenf_2019-10-0617:32:53.171416.tsv',
 'audited_training_data_aideenf_2019-10-0811:12:44.116775.tsv',
 'audited_training_data_aideenf_2019-10-0814:17:38.344798.tsv',
 'audited_training_data_aideenf_2019-10-0814:37:21.889650.tsv',
 'audited_training_data_aideenf_2019-10-0814:51:31.725512.tsv',
 'audited_training_data_aideenf_2019-10-0814:58:41.347833.tsv',
 'audited_training_data_aideenf_2019-10-0815:03:23.867661.tsv',
 'audited_training_data_aideenf_2019-10-0815:11:59.552159.tsv',
 'audited_training_data_aideenf_2019-10-0815:21:20.992877.tsv',
 'audited_training_data_aideenf_2019-10-0815:32:43.192700.tsv',
 'audited_training_data_aideenf_2019-10-0815:38:10.178973.tsv',
 'audited_training_data_dsolanno_2019-10-0709:58:50.185973.tsv']

In [None]:
def delete_files_from_github(git_user, git_password, git_repo, GIT_PATH, del_files_list):
    '''
    Work-in-progress helper for removing processed audit files from GitHub.

    Nothing is deleted yet: the function only looks up
    GIT_PATH + "test.txt" on the "test" branch and prints what it finds.

    takes:
            git username, password, repo,
            GIT_PATH: path prefix inside the repository,
            del_files_list: files to delete (currently unused)

    returns:
            (contents, 'ok') on success, otherwise ("error", message)
    '''
    message = 'ok'

    try:
        g = Github(git_user, git_password)
        try:
            # Same lookup as the save helpers. A second lookup through
            # g.get_repo(git_repo) was dropped.
            # NOTE(review): presumably it needs an "owner/name" string and so
            # could not accept the same value -- confirm.
            repo = g.get_user().get_repo(git_repo)
        except (IOError, OSError, GithubException) as e:
            # Python 3 exceptions have no .message attribute.
            return "error", str(e)

        print (repo)
        contents = repo.get_contents(GIT_PATH + "test.txt", ref="test")
        # TODO: iterate over del_files_list and call
        # repo.delete_file(contents.path, "Audit file removed automatically after processed", contents.sha, branch="test")

        print (contents)
        return contents, message
    except Exception:
        # Best-effort boundary: any failure is reported to the caller as an
        # ("error", message) pair instead of raising.
        message = "GitHub delete FAILED:" + '\n' +"Are your github login credentials correct?" + '\n' + "Are you a collaberator in the repo?"
        return "error", message