# In [None]:
#!/usr/bin/python3.7
# -*- coding: utf-8 -*-

from argparse import ArgumentTypeError, ArgumentParser
from nltk import tokenize, sent_tokenize, stem
from datetime import timedelta
from math import ceil, log
import preprocessor as p
from uuid import uuid4
from sys import stdout
from os import path
import logging
import codecs
import time
import re


def show_progress(iteration, total, estimation, prefix='   ', decimals=1, final=False):
    """ Print the iterations progress as a one-line text bar on stdout.

    The line starts with a carriage return, so successive calls overwrite the same console line.

    :param iteration: number of operations already done.
    :param total: total number of operations; when it is zero (nothing to process) the bar is drawn as
        complete instead of raising ZeroDivisionError.
    :param estimation: estimated time in seconds, rounded up; negative values are shown as 0:00:00.
    :param prefix: text written before the percentage.
    :param decimals: number of decimal places of the percentage.
    :param final: if True, a line break is written after the bar.
    :return: None.
    """
    bar_length = 32
    eta = str(timedelta(seconds=max(0, int(ceil(estimation)))))

    if total:
        percentage = 100 * (iteration / float(total))
        filled_length = int(round(bar_length * iteration / float(total)))
    else:
        # An empty workload has nothing left to do: report it as finished.
        percentage = 100.0
        filled_length = bar_length

    str_format = "{0:." + str(decimals) + "f}"
    percents = str_format.format(percentage)
    bar = '█' * filled_length + '-' * (bar_length - filled_length)
    stdout.write('\r%s %s%s |%s| %s' % (prefix, percents, '%', bar, eta))

    if final:
        stdout.write('\n')

    stdout.flush()


def format_time(seconds):
    """ Format an amount of seconds as "[D day(s), ]H:MM:SS".

    The value is rounded up to whole seconds and negative amounts are shown as zero.

    :param seconds: amount of seconds (int or float).
    :return: the formatted string.
    """
    whole_seconds = int(ceil(seconds))

    if whole_seconds < 0:
        whole_seconds = 0

    return str(timedelta(seconds=whole_seconds))


def str_to_bool(v):
    """ Convert a textual flag to a boolean (case insensitive).

    :param v: string to be converted; 'yes', 'true', 't', 'y', '1' mean True and 'no', 'false', 'f', 'n', '0'
        mean False.
    :return: the boolean value.
    :raise ArgumentTypeError: if the string is none of the accepted values.
    """
    lowered = v.lower()

    if lowered in {'yes', 'true', 't', 'y', '1'}:
        return True

    if lowered in {'no', 'false', 'f', 'n', '0'}:
        return False

    raise ArgumentTypeError("invalid boolean value: %s" % str(v))


def natural(v):
    """ Convert a value to a natural number (an integer bigger than 0).

    :param v: value to be converted (usually a command-line string).
    :return: the value as an int.
    :raise ArgumentTypeError: if the value is not an integer or is not bigger than 0.
    """
    try:
        number = int(v)
    except ValueError:
        raise ArgumentTypeError("invalid natural number value: '%s'" % str(v))
    else:
        if number <= 0:
            raise ArgumentTypeError("invalid natural number value: '%s'" % str(number))

        return number


def is_commom_word(word):
    """ Tell whether a string looks like a common word.

    A common word is longer than 3 characters, has at least one alphabetic character and is made only of
    alphabetic characters, ASCII digits, hyphens, underlines and apostrophes.

    :param word: string to be checked.
    :return: True if the string is a common word, False otherwise.
    """
    if len(word) <= 3:
        return False

    has_letter = False

    for char in word:
        if char.isalpha():
            has_letter = True
        elif not re.search("[A-Za-z0-9-_\']+", char):
            return False  # A character outside the allowed set disqualifies the whole word.

    return has_letter


def _build_argument_parser():
    """ Build the command-line parser of the generator.

    :return: the configured ArgumentParser.
    """
    parser = ArgumentParser(description="BoW-Based Text Representation Generator")
    # Drops argparse's last built-in group from the help (private attribute), so that the two custom groups
    # below are listed instead.
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    optional.add_argument("--log", metavar='BOOL', type=str_to_bool, action="store", dest="log", nargs="?", const=True,
                          default=False, required=False, help='display log during the process: y, [N]')
    optional.add_argument("--tokenize", metavar='BOOL', type=str_to_bool, action="store", dest="tokenize", nargs="?",
                          const=True, default=False, required=False,
                          help='specify if texts need to be tokenized: y, [N]')
    optional.add_argument("--ignore_case", metavar='BOOL', type=str_to_bool, action="store", dest="ignore_case",
                          nargs="?", const=True, default=True, required=False, help='ignore case: [Y], n')
    optional.add_argument("--stemm", metavar='BOOL', type=str_to_bool, action="store", dest="stemm", nargs="?",
                          const=True, default=False, required=False, help='enable stemming (case insensitive): y, [N]')
    optional.add_argument("--validate_words", metavar='BOOL', type=str_to_bool, action="store", dest="validate_words",
                          nargs="?", const=True, default=True, required=False,
                          help='validate vocabulary ([A-Za-z0-9-_\']+): [Y], n')
    optional.add_argument("--stoplist", metavar='FILE_PATH', type=str, action="store", dest="stoplist", default=None,
                          required=False, nargs="?", const=True, help='stoplist file')
    optional.add_argument("--doc_freq", metavar='INT', type=natural, action="store", dest="doc_freq", default=2,
                          nargs="?", const=True, required=False, help='min. frequency of documents (>= 1): [2]')
    optional.add_argument("--metric", metavar='STR', type=str, action="store", dest="metric", default="TF-IDF",
                          nargs="?", const=True, required=False, help='term relevance metric: tf, idf, [TF-IDF]')
    optional.add_argument("--print_features", metavar='BOOL', type=str_to_bool, action="store", dest="print_features",
                          nargs="?", const=True, default=True, required=False,
                          help='print features on file header: [Y], n')
    required.add_argument("--language", metavar='STR', type=str, action="store", dest="language", nargs="?", const=True,
                          required=True, help='dataset language: EN, ES, FR, DE, IT, PT')
    required.add_argument("--input", "-i", metavar='FILE_PATH', type=str, action="store", dest="input", required=True,
                          nargs="?", const=True, help='dataset input file')
    required.add_argument("--output", "-o", metavar='FILE_PATH', type=str, action="store", dest="output", required=True,
                          nargs="?", const=True, help='text representation output file')
    return parser


def _select_language(code):
    """ Map a language code to the NLTK resources used for it (unknown codes fall back to English).

    :param code: language code given on the command line (matched case-insensitively, like the metric).
    :return: tuple (normalized code, NLTK language name, Snowball stemmer instance).
    """
    languages = {
        "ES": ("spanish", stem.snowball.SpanishStemmer),
        "FR": ("french", stem.snowball.FrenchStemmer),
        "DE": ("german", stem.snowball.GermanStemmer),
        "IT": ("italian", stem.snowball.ItalianStemmer),
        "PT": ("portuguese", stem.snowball.PortugueseStemmer),
        "EN": ("english", stem.snowball.EnglishStemmer),
    }
    code = str(code).upper()

    if code not in languages:
        code = "EN"

    nltk_language, stemmer_class = languages[code]
    return code, nltk_language, stemmer_class()


def _load_stoplist(filepath, ignore_case):
    """ Read a stoplist file (one entry per line) into a set, for constant-time membership tests.

    :param filepath: path of the UTF-8 stoplist file.
    :param ignore_case: if True, the entries are lower-cased.
    :return: set with the stripped entries.
    """
    with open(filepath, encoding="utf-8") as stoplist_file:
        entries = [line.strip() for line in stoplist_file]

    if ignore_case:
        entries = [entry.lower() for entry in entries]

    return set(entries)


def _extract_words(text, args, nltk_language, stemmer, stoplist):
    """ Clean a sample text and extract the words that will feed the bag of words.

    :param text: raw text of the sample.
    :param args: parsed arguments (tokenize, ignore_case, validate_words and stemm are read).
    :param nltk_language: NLTK language name used by the sentence tokenizer.
    :param stemmer: stemmer applied to the accepted tokens when stemming is enabled.
    :param stoplist: collection of words to be discarded.
    :return: tuple (list of words, number of text chunks, number of sentences).
    """
    # NOTE(review): split() breaks the text on ANY whitespace, so every "paragraph" counted here is in fact a
    # whitespace-separated chunk; kept as it is because changing it would change the generated features.
    chunks = p.clean(text).replace("#", "").split()  # Removing URLs and emojis.
    words = []
    num_sentences = 0

    for chunk in chunks:
        sentences = sent_tokenize(chunk, nltk_language)  # Identifying sentences.
        num_sentences += len(sentences)

        for sentence in sentences:
            if args.tokenize:
                tokens = tokenize.word_tokenize(sentence)  # Works well for many European languages.
            else:
                tokens = sentence.split()

            if args.ignore_case:
                tokens = [token.lower() for token in tokens]

            for token in tokens:
                if token in stoplist or (args.validate_words and not is_commom_word(token)):
                    continue

                word = stemmer.stem(token) if args.stemm else token

                if word not in stoplist:  # The stem itself may be a stop word.
                    words.append(word)

    return words, len(chunks), num_sentences


def _yes_no(flag):
    """ Return the text written on the log file for a boolean parameter. """
    return "yes" if flag else "no"


def _generate(args, nltk_language, stemmer, total_start, log_file):
    """ Read the dataset, build the bag of words and write the text representation and the log.

    :param args: parsed and validated arguments.
    :param nltk_language: NLTK language name of the dataset.
    :param stemmer: stemmer of the dataset language.
    :param total_start: time (seconds since the epoch) when the run started.
    :param log_file: open, writable log file; it is NOT closed here.
    :return: None.
    """
    ################################################################################
    ### INPUT                                                                    ###
    ################################################################################

    print("\nBoW-Based Text Representation Generator\n=======================================\n")
    log_file.write("BoW-Based Text Representation Generator\n=======================================\n\n")
    log_file.write("> Parameters:\n")
    log_file.write("\t- Tokenize:\t\t\t" + _yes_no(args.tokenize) + "\n")
    log_file.write("\t- Ignore case:\t\t" + _yes_no(args.ignore_case) + "\n")
    log_file.write("\t- Stemming:\t\t\t" + _yes_no(args.stemm) + "\n")
    log_file.write("\t- Validate words:\t" + _yes_no(args.validate_words) + "\n")

    if args.stoplist is not None:
        log_file.write("\t- Stoplist:\t\t" + args.stoplist + "\n")

    log_file.write("\t- Doc. frequency:\t>= " + str(args.doc_freq) + "\n")
    args.metric = args.metric.lower()

    if args.metric not in ("tf", "idf"):  # Any other value means the default metric.
        args.metric = "tf-idf"

    log_file.write("\t- Metric:\t\t\t" + args.metric.upper() + "\n")
    log_file.write("\t- Print features:\t" + _yes_no(args.print_features) + "\n")
    log_file.write("\t- Language:\t\t\t" + args.language + "\n")
    log_file.write("\t- Input:\t\t\t" + args.input + "\n")
    log_file.write("\t- Output:\t\t\t" + args.output + "\n\n\n")

    if not path.exists(args.input):
        print("ERROR: input file does not exists!\n\t!Filepath: " + args.input)
        log_file.write("ERROR: input file does not exists!\n\t!Filepath: " + args.input)
        return

    print("> Loading dataset...\n\n")

    # The file is read once, with the built-in open(): counting the lines with open() and reading them with
    # codecs.open() could disagree, because codecs readers also break lines on characters such as U+2028.
    # Blank lines (e.g. a trailing one) hold no sample and are skipped.
    with open(args.input, encoding="utf8") as dataset:
        header = dataset.readline()
        samples = [line for line in dataset if line.strip()]

    num_samples = len(samples)
    stoplist = set()
    log_file.write("> Dataset filepath:\t\t" + args.input + "\n")

    if args.stoplist is not None:
        print("> Loading stoplist...\n\n\n")
        stoplist = _load_stoplist(args.stoplist, args.ignore_case)

    ################################################################################
    ### PRE-PROCESSING                                                           ###
    ################################################################################

    print("> Building text representation:")
    total_operations = max(1, num_samples)  # At least 1, so an empty dataset never divides the progress by zero.
    num_paragraphs = 0
    num_sentences = 0
    filepath_i = 0
    show_progress(filepath_i, total_operations, 0)
    operation_start = time.time()
    document_words = []
    classes_names = [name.strip() for name in header.split('\t')[2:]]

    for sample in samples:
        # Columns: id, text and then one column per class. Apostrophes are removed from the whole line.
        data_columns = sample.replace("'", "").split('\t')
        words, chunk_count, sentence_count = _extract_words(data_columns[1].strip(), args, nltk_language, stemmer,
                                                            stoplist)
        num_paragraphs += chunk_count
        num_sentences += sentence_count
        document_words.append({"id": data_columns[0].strip(), "length": len(words), "words": sorted(words),
                               "classes": [c.strip() for c in data_columns[2:]]})  # Sorted words.

    ############################################################################
    ### TEXT REPRESENTATION                                                  ###
    ############################################################################

    features = []  # BoW columns, in order of first appearance.
    feature_columns = {}  # Word -> index on 'features' (dict lookup instead of scanning the list).
    doc_occurences = []  # Number of documents where each feature appears.
    bag = []  # One {feature column: literal frequency} dict per document.

    for document in document_words:
        start = time.time()
        counts = {}

        for word in document["words"]:
            column = feature_columns.get(word)

            if column is None:  # Adding new word in features list (BoW column).
                column = feature_columns[word] = len(features)
                features.append(word)
                doc_occurences.append(0.0)

            if column in counts:
                counts[column] += 1
            else:  # First occurrence of the feature on this document.
                counts[column] = 1
                doc_occurences[column] += 1.0

        bag.append(counts)
        filepath_i += 1
        end = time.time()
        eta = (total_operations - filepath_i) * (end - start)
        show_progress(filepath_i, total_operations, eta)

    ############################################################################
    ### FEATURES RELEVANCE                                                   ###
    ############################################################################

    # Features with document frequency lower than the specified one are dropped BEFORE the dense rows are
    # built; the values of the remaining features do not depend on the dropped ones.
    kept_columns = [column for column, occurences in enumerate(doc_occurences) if occurences >= args.doc_freq]
    features = [features[column] for column in kept_columns]
    num_features = len(features)
    idf = []

    if args.metric != "tf":
        idf = [log(num_samples / doc_occurences[column]) for column in kept_columns]

    bow = []

    for document, counts in zip(document_words, bag):
        length = document["length"]
        row = []

        for position, column in enumerate(kept_columns):
            freq = counts.get(column, 0)

            if args.metric == "idf":  # Inverse Document Frequency.
                # NOTE(review): the IDF is written on every document, even where the term is absent, as the
                # original implementation did — confirm that this is intended.
                row.append(idf[position])
            elif freq == 0:
                row.append(0.0)
            elif args.metric == "tf":  # Term Frequency (freq > 0 implies length > 0).
                row.append(freq / length)
            else:  # Term Frequency - Inverse Document Frequency.
                row.append((freq / length) * idf[position])

        bow.append(row)

    ############################################################################
    ### OUTPUT                                                               ###
    ############################################################################

    with codecs.open(args.output, "w", encoding='utf-8') as output_file:
        output_file.write(str(num_samples) + " " + str(num_features) + "\n")

        if args.print_features:
            output_file.write(str("\t".join(features)) + "\t")
        else:
            output_file.write(str("\t".join(["f" + str(i) for i in range(1, num_features + 1)])) + "\t")

        output_file.write(str("\t".join(classes_names)) + "\n")

        for document, row in zip(document_words, bow):
            output_file.write("".join(str(value) + "\t" for value in row))
            output_file.write(str("\t".join(document["classes"])) + "\n")

    operation_end = time.time()
    eta = operation_end - operation_start
    show_progress(total_operations, total_operations, eta, final=True)

    total_end = time.time()
    run_time = format_time(total_end - total_start)

    print("\n\n> Log:")
    print("    - Run time:\t\t%s" % run_time)
    print("    - # samples:\t%i" % num_samples)
    print("    - # features:\t%i" % num_features)
    print("    - # paragraphs:\t%i" % num_paragraphs)
    print("    - # sentences:\t%i\n" % num_sentences)

    log_file.write("\n\n> Log:\n")
    log_file.write("\t- Run time:\t\t\t%s\n" % run_time)
    log_file.write("\t- # samples:\t\t%i\n" % num_samples)
    log_file.write("\t- # features:\t\t%i\n" % num_features)
    log_file.write("\t- # paragraphs:\t\t%i\n" % num_paragraphs)
    log_file.write("\t- # sentences:\t\t%i" % num_sentences)


def main():
    """ Entry point: parse the command line and generate the BoW-based text representation.

    Reads a tab-separated dataset (header line, then one sample per line: id, text and class columns), writes
    the weighted bag of words on the output file and a report on a new file inside the "logs" directory.

    :return: None.
    """
    from os import makedirs  # Local import: only needed to guarantee that the log directory exists.

    # Defining script arguments:
    parser = _build_argument_parser()
    args = parser.parse_args()  # Verifying arguments.

    # Every option is declared with nargs="?" and const=True, so a bare flag (e.g. "--input" without a value)
    # yields True. That is meaningless for the options below and used to crash later on; reject it here.
    for option in ("stoplist", "metric", "input", "output"):
        if getattr(args, option) is True:
            parser.error("argument --%s: expected one argument" % option)

    args.doc_freq = int(args.doc_freq)  # A bare "--doc_freq" (True) behaves as 1.

    # Setup logging:
    if args.log:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    args.language, nltk_language, stemmer = _select_language(args.language)
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
    total_start = time.time()

    makedirs("logs", exist_ok=True)
    log_path = "logs/BoW-log_" + time.strftime("%Y-%m-%d_%H-%M-%S") + "_" + str(uuid4().hex) + ".txt"

    # The context manager closes the log file on every exit path, including errors.
    with codecs.open(log_path, "w", "utf-8") as log_file:
        _generate(args, nltk_language, stemmer, total_start, log_file)


# Run the generator only when this file is executed as a script (not when it is imported).
if __name__ == '__main__':
    main()