In [1099]:
import pandas as pd
import numpy as np
import nltk
import os
import nltk.corpus
import glob


In [1100]:
from pathlib import Path
# change these to the directory where you cloned openvirusdev
HOME = str(Path.home())
OPENVIRUSDEV = HOME + "/" + "workspace/openvirusdev"
PROJECTS = HOME + "/" + "projects"


In [1101]:
# PMR's projects. 
IPYNB = OPENVIRUSDEV + "/" + "ipynb"
VIRUS2019 = OPENVIRUSDEV + "/" + "viral2019"

OPENBATTERY = PROJECTS + "/" + "open-battery"
LIION = OPENBATTERY + "/" + "liion"



In [1102]:
project = VIRUS2019
project = LIION
os.chdir(project)

In [1103]:
def get_section_globs():
    section_globs = {}

    section_globs["abstract"]  = ("abstract",   '**/sections/**/*abstract*.xml')
    section_globs["abstract1"] = ("abstract1"), '**/sections/**/*abstract*/*.xml'
    section_globs["method"]    = ("method",     '**/sections/**/*method*/*.xml')
    section_globs["all"]       = ("all",        '**/sections/**/*.xml')
    section_globs["figure"]    = ("figure",     '**/sections/**/*figure*.xml')
    section_globs["table"]     = ("table",      '**/sections/**/*table*.xml')
    section_globs["reflist"]   = ("ref-list",    '**/sections/**/*ref-list*/*.xml')

    return section_globs

In [1104]:
def get_globbed_files(directory, file_glob, recurse=True):
    """
    returns a list of files satisfying the file_glob expression
    in the context of dir
    temporarily changes directory and then resets to current dir
    recurses through the directory if recursive = True (default)    
    """
    current_dir = os.getcwd()
    os.chdir(directory)
    files = glob.glob(file_glob, recursive=recurse)
#    print("number of " + file_glob + " files in " + directory + ": " + str(len(files)))
    os.chdir(current_dir)
    return files

In [1105]:
def get_or_create_section_dirs():
    section_dirs = get_globbed_files(project, '**/sections')
    if (len(section_dirs) == 0):
        cmd = "ami -v -p " + project + " section"
        print("running: "+cmd)
        ! $cmd
        section_dirs = get_globbed_files(project, '**/sections')
        print("found: " + section_dirs)

    

In [1106]:

def get_glob_dict0():
    return get_glob_dict(get_section_globs())

def get_glob_dict(section_globs):
    
    glob_dict = {}
    for key in section_globs:
        glob = section_globs[key]
        glob_dict[key] = glob
        
    return glob_dict


In [1107]:
def read_text_contents(text_files):
    os.chdir(project)

    text_contents = []
    for text_file in text_files:
        text_filex = open(text_file,mode='r')
        text = text_filex.read()
        text_filex.close()
        text_contents.append(text)

    return text_contents


In [1108]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
import re

def remove_xml_punkt_tokenize_stopwords(text_contents, minlen=2, lang='english'):
    nltk.download('stopwords')

    stop_en = set(stopwords.words(lang))
    stop_sci = get_stopwords_sci();

    # filter words
    content_words_list = []
    for text_content in text_contents:
        # remove (most) XML markup
        text_content1 = re.sub('</?[^>]*>', '', text_content)
        words = word_tokenize(text_content1)
        # remove stopwords, punctuation, and short words
        words = [w for w in words \
                  if w.lower() not in stop_en \
                  and w not in stop_sci\
                  and w not in string.punctuation\
                  and not matches_noise(w)\
                  and len(w) >= minlen
                 ]

        content_words_list.append(words)

    content_words = [word for lizt in content_words_list for word in lizt]
    return content_words

def matches_noise(word):
    regex = re.compile("([A-Z]\.)\|(\-?\d+)")
    match = regex.fullmatch(word)
    print("noise " + word +", "+str(match))
    return not regex.fullmatch(word) == None

def get_stopwords_sci():
    words =['et', 'al', 'J.']
    return set(words)

[nltk_data] Downloading package punkt to /Users/pm286/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1109]:
def analyze_frequencies_all():
    for section in get_glob_dict0():
        analyze_frequencies0(section)
        

In [1110]:
def analyze_frequencies0(section):
    glob_expr = get_glob_dict0()[section][1]
    print("glob " + glob_expr)
    files = get_globbed_files(project, glob_expr)
    analyze_frequencies(files, section)
    print("files " + str(len(files)))

In [1111]:
def analyze_frequencies(files, title):
    text_contents = read_text_contents(files)
    words = remove_xml_punkt_tokenize_stopwords(text_contents)
    plot_frequency(words, title)

In [1112]:
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

def plot_frequency(words, title="title"):
    fdist = FreqDist(words)
    print(fdist.most_common(30))
    fdist.plot(30, title=title)


In [1113]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer 

def stem_lemmatize(words, stemmer=PorterStemmer(), lemmatizer=WordNetLemmatizer()):

    stemmed = []
    for word in words :
        if (stemmer != None):
            word1 = stemmer.stem(word)
        elif (lemmatizer != None):
            word1 = lemmatizer.lemmatize(word)
        stemmed.append(word1)
    return stemmed
    


In [1114]:
from nltk import ne_chunk

def chunk(text):    
    nltk.download('maxent_ne_chunker')
    nltk.download('words')
    # tokenize and POS Tagging before doing chunk
    tokens = word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    chunk = ne_chunk(tags)
    return chunk


In [1115]:
def noun_phrase(text):
    tokens = word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    reg = "NP: {<DT>?<JJ>*<NN>}" 
    parser = nltk.RegexpParser(reg)
    np_tree = parser.parse(tags)
    return np_tree

In [1116]:
def count_vectorize():
    """
    not tested
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.tokenize import RegexpTokenizer
    #tokenizer to remove unwanted elements from out data like symbols and numbers
    token = RegexpTokenizer(r'[a-zA-Z0-9]+')
    cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)


In [1117]:
def sentence_tokenizer(text):
    from nltk.tokenize import sent_tokenize
    sentences=sent_tokenize(text)
    print("sentences: "+str(len(sentences)))
    for sentence in sentences:
        print(">> "+sentence+"\n..")
        phrases = sentence.split("\n")
        print("??"+str(len(phrases)))


In [1118]:
def tfidfVector():
    """
    not tested
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    tf=TfidfVectorizer()
    # text_tf= tf.fit_transform(data['Phrase'])
    text_tf= tf.fit_transform(phrases)

In [1119]:
def tfidf():
    """
    not tested
    """
    import pandas as pd 
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    cv=CountVectorizer() 
    word_count_vector=cv.fit_transform(sentences)
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
    tfidf_transformer.fit(word_count_vector)
    # print idf values 
    df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names(),columns=["idf_weights"]) 

    # sort ascending 
    df_idf.sort_values(by=['idf_weights'])

In [1120]:
# ==================
# from scikit-learn
# https://scikit-learn.org/stable/auto_examples/bicluster/plot_bicluster_newsgroups.html#sphx-glr-auto-examples-bicluster-plot-bicluster-newsgroups-py
# ==================

In [1121]:
# MAIN ENTRY

# analyze_frequencies0('abstract')
# analyze_frequencies_all();
regex = re.compile('xyz|\-?\d+')
for a in ['xyz', 'J.', '-10', 'et']:
    print("===")
    match = regex.fullmatch(a)
    if not match == None:
        print("matched: " + a)
    print(a + ": " + str(match))
    print(a + ": " + str(match == None))
    nmatch = matches_noise(a)
    print("nmatch " + str(nmatch))
    if not nmatch == None:
        print("noise: " + a)



===
matched: xyz
xyz: <re.Match object; span=(0, 3), match='xyz'>
xyz: False
noise xyz, None
noise: xyz
===
J.: None
J.: True
noise J., None
noise: J.
===
matched: -10
-10: <re.Match object; span=(0, 3), match='-10'>
-10: False
noise -10, None
noise: -10
===
et: None
et: True
noise et, None
noise: et
