- markdown_to_text()
- remove_nums_from_str(s)
- pre_process_sentence(sent)
- pre_process(text): returns the text tokenized into sentences
- remove_excluded_files(file_list)
- remove_excluded_files_except(file_list, except_with_text)
- rreplace(s, old, new, occurrence)
- clean_file_name(name, replacements2=[]):
- save_to_github(git_user, git_password, git_repo, my_file_list, push_to_git_as)
- keep_pdf_urls_only(file_list)

In [9]:
# Required Python utilities
import numpy as np
import pandas as pd

from collections import Counter
import re
from langdetect import detect
from bs4 import BeautifulSoup
from markdown import markdown
from lxml import etree
import os
import random
import tqdm
import itertools 
import pickle

## Deep Learning imports for the classifiers
os.environ['KERAS_BACKEND']='theano'

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Concatenate
from keras.models import Model

## ML required imports (for clustering)
from sklearn import metrics
from sklearn.decomposition import PCA, LatentDirichletAllocation
from sklearn.preprocessing import scale, StandardScaler
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN


# Topic modeling imports
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## NLP related imports
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.snowball import SnowballStemmer


# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set() 

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import base64
from github import Github
from github import InputGitTreeElement
import requests
import time

In [10]:
def markdown_to_text(markdown_string):
    """Convert a Markdown string to plain text.

    The Markdown is rendered to HTML, ``<pre>`` and ``<code>`` elements are
    blanked out, and the remaining text nodes are concatenated.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The concatenation of every text node left in the rendered HTML; each
        removed code snippet is replaced by a single space.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # Remove code snippets. DOTALL is required because "." does not match a
    # newline by default, so multi-line blocks would otherwise survive.
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html, flags=re.DOTALL)
    # The closing tag is "</code>" (no space before ">"); with the stray
    # space the pattern could never match rendered inline code.
    html = re.sub(r'<code>(.*?)</code>', ' ', html, flags=re.DOTALL)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    return ''.join(soup.findAll(text=True))

In [39]:
def remove_nums_from_str(s):
    """Return ``s`` with every digit character dropped.

    A character counts as a digit when its ``isdigit()`` method says so.
    """
    non_digits = (ch for ch in s if not ch.isdigit())
    return ''.join(non_digits)

In [43]:
def pre_process_sentence(sent):
    """Normalise the punctuation and spacing of a single sentence.

    Steps, in order: dashes become spaces, semicolons become ", ", runs of
    spaces are collapsed, surrounding whitespace is stripped, commas become
    spaces, digits are removed, and finally one pass each turns a double
    space into a single space and " ." into ".".

    Returns:
        The cleaned sentence.
    """
    def _squeeze(value):
        # Collapse every run of spaces into a single space.
        return re.sub(' +', ' ', value)

    cleaned = _squeeze(re.sub('-', ' ', sent))
    cleaned = _squeeze(cleaned.replace(";", ", "))

    # A single strip() already trims both ends.
    cleaned = cleaned.strip()

    without_commas = cleaned.replace(",", " ")
    cleaned = remove_nums_from_str(without_commas)

    # Dropping commas and digits can leave doubled spaces or a space in
    # front of a full stop; each replacement below is a single pass.
    return cleaned.replace("  ", " ").replace(" .", ".")

In [32]:
def pre_process(text):
    """Clean a raw description and split it into lower-cased sentences.

    The input is converted with ``str()``, stripped of HTML and Markdown
    markup, reduced to letters, digits, whitespace, commas and dots, and
    tokenized with ``sent_tokenize``.

    Args:
        text: The raw document; any object accepted by ``str()``.

    Returns:
        The list of sentences produced by ``sent_tokenize`` on the cleaned,
        lower-cased text.
    """
    # Drop a leading "b" that is followed by whitespace.
    document = re.sub(r'^b\s+', '', str(text))

    # Remove explicit line changes, i.e. a literal backslash followed by "n".
    document = re.sub(r'\\n', '', document)

    # Remove HTML code from the text. The parser is named explicitly so the
    # result does not depend on which optional parsers are installed; it is
    # the same parser markdown_to_text() uses.
    document = BeautifulSoup(document, "html.parser").get_text()

    # Parse text from markdown code
    document = markdown_to_text(document)

    # Remove URLs that start a line (the pattern is anchored with "^").
    document = re.sub(r'^https?:\/\/.*[\r\n]*', '', document, flags=re.MULTILINE)

    # Remove strings such as \\xe5 \\xe6 \\xe7 that appear a lot in the
    # descriptions.
    document = re.sub(r':?\\+x\w{2}', ' ', document)

    # Replace every special character except whitespace, dashes, commas and
    # dots with a space. Semicolons are removed here too, so no separate
    # ";" handling is needed afterwards.
    document = re.sub(r"[^\s.,\-a-zA-Z0-9]", ' ', document)

    # Dashes act as word separators: turn each run of dashes into a space.
    # Deleting runs of two or more outright would glue the neighbouring
    # words together ("foo--bar" -> "foobar").
    document = re.sub(r'-+', ' ', document)

    # Substitute multiple whitespace characters with a single space.
    document = re.sub(r'\s+', ' ', document)

    # Convert to lowercase, then tokenize into sentences.
    return sent_tokenize(document.lower())


In [45]:
def remove_excluded_files(file_list):
    """Keep only the "gathered_" files that are not hidden or random.

    A name is kept when it does not start with "." or "_", does not contain
    "random", and does contain "gathered_". Input order is preserved.
    """
    return [
        name
        for name in file_list
        if not name.startswith('.')
        and "random" not in name
        and "gathered_" in name
        and not name.startswith("_")
    ]

def keep_pdf_urls_only(file_list):
    """Return the entries that end in ".pdf" and contain no space.

    The suffix check is case-sensitive and input order is preserved.
    """
    def _is_clean_pdf(candidate):
        return candidate.endswith('.pdf') and " " not in candidate

    return list(filter(_is_clean_pdf, file_list))

def remove_excluded_files_except(file_list, except_with_text):
    """Keep the non-hidden, non-random files whose name contains a marker.

    A name is kept when it does not start with "." or "_", does not contain
    "random", and does contain ``except_with_text``. Input order is
    preserved.
    """
    kept = []
    for name in file_list:
        if name.startswith('.'):
            continue
        if "random" in name:
            continue
        if except_with_text not in name:
            continue
        if name.startswith("_"):
            continue
        kept.append(name)
    return kept
        
def rreplace(s, old, new, occurrence):
    """Replace the last ``occurrence`` matches of ``old`` in ``s`` with ``new``.

    The string is split from the right at most ``occurrence`` times on
    ``old`` and the pieces are joined back together with ``new``.
    """
    return new.join(s.rsplit(old, occurrence))


def clean_file_name(name, replacements2=None):
    """Strip known extensions and caller-supplied fragments from a file name.

    Every occurrence of ".txt", ".csv" and ".tsv" is removed from ``name``,
    wherever it appears, followed by every occurrence of each string in
    ``replacements2``.

    Args:
        name: The file name to clean.
        replacements2: Optional iterable of additional substrings to remove.
            Defaults to none; ``None`` is used instead of a mutable ``[]``
            default so that no list object is shared between calls.

    Returns:
        The cleaned name.
    """
    default_fragments = (".txt", ".csv", ".tsv")
    extra_fragments = tuple(replacements2 or ())

    # Built-in fragments are removed first, then the caller's.
    for fragment in default_fragments + extra_fragments:
        name = name.replace(fragment, "")
    return name


def save_to_github(git_user, git_password, git_repo, my_file_list, push_to_git_as):
    """Commit local files to the ``master`` branch of a GitHub repository.

    In order to push a file to GitHub it must first be stored locally (the
    local location may also be local to a virtual machine). Each file is
    read as bytes, uploaded as a base64 git blob, and all blobs are added
    to ``heads/master`` in a single commit.

    Args:
        git_user: GitHub user name.
        git_password: GitHub password.
        git_repo: Name of the repository, looked up under the logged-in user.
        my_file_list: Local paths of the files to push.
        push_to_git_as: Matching list of destination paths in the
            repository; entry ``i`` is the target of ``my_file_list[i]``.

    Returns:
        None. Progress and failures are reported with ``print``; any
        exception raised while committing is caught and printed.
    """
    commit_message = 'training data updated via the audit tool'

    try:
        g = Github(git_user, git_password)
        repo = g.get_user().get_repo(git_repo)

        master_ref = repo.get_git_ref('heads/master')
        master_sha = master_ref.object.sha
        base_tree = repo.get_git_tree(master_sha)
        element_list = []

        for i, entry in enumerate(my_file_list):
            print("file to commit:", entry)

            # Read the raw bytes through one handle that is always closed.
            # Base64 is applied exactly once, for every file type: the blob
            # is created with encoding "base64", so a second encoding (as
            # was done for .png files) would commit base64 text instead of
            # the original file.
            with open(entry, "rb") as input_file:
                data = base64.b64encode(input_file.read())

            blob = repo.create_git_blob(data.decode("utf-8"), "base64")
            element = InputGitTreeElement(
                path=push_to_git_as[i], mode='100644', type='blob', sha=blob.sha)

            print("push to git as:", push_to_git_as[i])

            # element_list holds one InputGitTreeElement per file; each
            # references its content through the blob sha created above.
            element_list.append(element)

        tree = repo.create_git_tree(element_list, base_tree)
        parent = repo.get_git_commit(master_sha)
        commit = repo.create_git_commit(commit_message, tree, [parent])
        master_ref.edit(commit.sha)
        print("File commited to github :", commit)
    except Exception as e:
        # "except e:" referenced an undefined name, so any failure surfaced
        # as a NameError instead of reaching this report.
        print("")
        print("GITHUB SUBMIT FAILED:")
        print("Are your github login credentials correct?")
        print("Are you a collaberator in the repo?")
        print(e)



def save_to_github_not_zip(git_user, git_password, git_repo, my_file_list, push_to_git_as):
    """Commit local text files to ``master`` using inline tree content.

    Each file's content is placed directly in an ``InputGitTreeElement``
    (no separate blob is created) and all files are added to
    ``heads/master`` in a single commit.

    Args:
        git_user: GitHub user name.
        git_password: GitHub password.
        git_repo: Name of the repository, looked up under the logged-in user.
        my_file_list: Local paths of the files to push.
        push_to_git_as: Matching list of destination paths in the
            repository; entry ``i`` is the target of ``my_file_list[i]``.

    Returns:
        The created commit on success, or the string ``"error"`` when an
        ``IOError`` is raised. Other exception types are not caught here.
    """
    commit_message = 'training data audited via the audit tool'

    try:
        g = Github(git_user, git_password)
        repo = g.get_user().get_repo(git_repo)

        master_ref = repo.get_git_ref('heads/master')
        master_sha = master_ref.object.sha
        base_tree = repo.get_git_tree(master_sha)
        element_list = []

        for i, entry in enumerate(my_file_list):
            print("open", entry)
            if entry.endswith('.png'):
                # Images must be read as bytes: b64encode() rejects str, and
                # a text-mode read of binary data can fail to decode. The
                # result is decoded to str before being used as content.
                # NOTE(review): the content is passed as the base64 text
                # itself, so the repository presumably receives base64 text
                # rather than a binary image - confirm this is intended, or
                # use save_to_github() for binary files.
                with open(entry, "rb") as input_file:
                    data = base64.b64encode(input_file.read()).decode("ascii")
            else:
                with open(entry) as input_file:
                    data = input_file.read()

            print("file to commit:", entry)
            print("push to git as:", push_to_git_as[i])
            element = InputGitTreeElement(push_to_git_as[i], '100644', 'blob', data)
            element_list.append(element)

        tree = repo.create_git_tree(element_list, base_tree)
        parent = repo.get_git_commit(master_sha)
        commit = repo.create_git_commit(commit_message, tree, [parent])
        master_ref.edit(commit.sha)

        return commit

    except IOError as e:
        print("GITHUB SUBMIT FAILED:")
        print("Are your github login credentials correct?")
        print("Are you a collaberator in the repo?")
        print(e)
        return "error"