# Find Indices of Facets
This code will find indices of specific facets by a process of elimination. Given a list of strings, move through the entire document and find which columns contain which fields.

Some of the data are easy to identify, for example the postal code. If the columns have any consistency, the postal code column will emerge through frequency analysis; that is, the column with the most postal code hits is the postal code column.

Some columns may never be identified with any consistency, but hints may emerge through a process of elimination. Once all the expected data has been accounted for, any additional columns are ignored.

# Data
You can download data from [Mockaroo](https://www.mockaroo.com/schemas/447387) by curl-ing this URL (but sign in first with your **GitHub** account):
```bash
curl "https://api.mockaroo.com/api/8eaffe90?count=1000&key=c939c280" >Dirty_User_Registration_Data.csv
```

In [2]:
import re
import os.path
from os import path
from collections import defaultdict
# Word-list (corpus) files, one entry per line, used to recognise free-form fields.
STREET_CORPUS = 'street_corpus.txt'
FNAME_CORPUS = 'fname.txt'
LNAME_CORPUS = 'lname.txt'
# Field name -> corpus file consulted when searching for that field.
corpus_dict = {
    'street': STREET_CORPUS,
    'fname': FNAME_CORPUS,
    'lname': LNAME_CORPUS,
    'city': 'alberta_towns.txt',
}
# Parameters that will be supplied by command arguments.
# A single comma-delimited sample record.
test_data = (
    "1234-567 Ave., Edmonton, AB., T6G 0G4 ,"
    "Bisland-Jones,example@company1.com,Harold,"
    "Customer note,(780)-555-1212,Male"
)
# Field lists that can be requested: the ten-field set and the eight-field set.
f10 = [
    'email', 'fname', 'lname', 'street', 'pcode',
    'city', 'province', 'gender', 'phone', 'note',
]
f02 = [
    'email', 'pcode', 'province', 'city',
    'phone', 'street', 'lname', 'fname',
]
# Fields that are looked for but not required.
optional00 = ['gender']

class Discovery:
    """Guess which column of a delimited data set holds which field.

    Every record is split on a delimiter and each requested field is looked
    for with a field-specific strategy: a regular expression for structured
    values (postal code, email, phone, province, gender, country) or a
    word-list (corpus) comparison for free-form values (street, city, first
    and last name).

    For each field the class keeps the column index of the most recent record
    in which the field was found (see getColumnDefinitions) and the number of
    records in which it was found (see histogram).
    """

    def __init__(self, data: str, required_field_names: list,
                 optional_field_names: list = None, delim: str = ",",
                 threshold: float = 90.0):
        """Scan the supplied data for the requested fields.

        param: data - path of a delimited text file; if no such file exists
               the string itself is treated as one delimited record.
        param: required_field_names - fields that must be found for the data
               to be considered well formed.
        param: optional_field_names - fields that are searched for but whose
               absence is not an error. Defaults to no optional fields.
        param: delim - column delimiter.
        param: threshold - percentage (0-100) of records in which a required
               field must be found; out-of-range values fall back to the
               default threshold with a warning.
        """
        # Not read anywhere in this class; kept so existing users of the
        # attribute keep working.
        self.halt = False
        # A list literal as the default would be shared between all calls.
        if optional_field_names is None:
            optional_field_names = []
        # Search for fields in this order.
        preferred_search_order = ['pcode', 'email', 'phone', 'province', 'gender', 'street', 'city', 'lname', 'fname']
        # A dictionary of strategies for finding specific data. Usually regular expressions.
        self.known_strategies = {}
        self.known_strategies['email'] = re.compile(r'^(\w|\.|\_|\-)+[@](\w|\_|\-|\.)+[.]\w{2,3}$')
        self.known_strategies['pcode'] = re.compile(r'^[a-zA-Z]\d[a-zA-Z](\s{1,})?\d[a-zA-Z]\d$')
        # The trailing \b stops the country/province codes from matching the
        # first letters of a longer word (e.g. 'NSmith@x.com' or 'CALGARY')
        # while still accepting 'AB' and 'AB.'.
        self.known_strategies['country'] = re.compile(r'^(CA|Canada)\b')
        self.known_strategies['province'] = re.compile(r'^(NL|PE|NS|NB|QC|ON|MB|SK|AB|BC|YT|NT|NU)\b')
        self.known_strategies['phone'] = re.compile(r'^(\+)?(\()?\d{3}(-| |\)(\s|-)?)?\d{3}(-| )?\d{4}$')
        self.known_strategies['gender'] = re.compile(r'^((M|m)ale|(F|f)emale|(P|p)refer\s+not\s+to\s+say|(N|n)ot\s+listed)$')
        self.known_strategies['city'] = self._corpus_compare_
        self.known_strategies['street'] = self._corpus_compare_
        self.known_strategies['fname'] = self._corpus_compare_
        self.known_strategies['lname'] = self._corpus_compare_
        # Corpora already read from disk, keyed by field name, so each file
        # is read once instead of once per record.
        self._corpus_cache = {}
        # Store the requested field names that we have strategies for finding.
        # The dictionary stores the field name and a boolean value; True = required, False = optional.
        self.requested_fields = {}
        self.default_threshold = 50.0
        if 0.0 <= threshold <= 100.0:
            self.success_threshold = threshold
        else:
            print(f"*warning using default success threshold of {self.default_threshold}")
            self.success_threshold = self.default_threshold
        # Field name -> column index of the most recent record that matched.
        self.fields_collected = defaultdict(int)
        # Field name -> number of records in which the field was found.
        self.fields_histogram = defaultdict(int)
        self.total_records = 0
        # Required fields are registered first so they are searched first;
        # a name given in both lists ends up optional, as before.
        self._register_fields_(required_field_names, preferred_search_order, True)
        self._register_fields_(optional_field_names, preferred_search_order, False)
        if path.isfile(data):
            with open(data) as f:
                for line in f:
                    record = line.rstrip("\r\n")
                    # A blank line is not a record; counting it would only
                    # drag down every field's hit frequency.
                    if not record.strip():
                        continue
                    self.total_records += 1
                    self._findRequestedData_(record.split(delim))
        else:  # not a file, but a single record of data
            self.total_records += 1
            self._findRequestedData_(data.split(delim))

    # Registers the requested fields we know how to find and reports the rest.
    # param: field_names - field names requested by the caller.
    # param: preferred_search_order - field names ordered from the most robust
    #        search strategy to the least.
    # param: required - True if the fields are required, False if optional.
    # return: Nothing, but requested_fields is updated.
    def _register_fields_(self, field_names: list, preferred_search_order: list, required: bool):
        kind = "required" if required else "optional"
        ordered_fields_list = [name for name in preferred_search_order if name in field_names]
        # Names outside the preferred order are appended (once each) so that
        # a known strategy such as 'country' is still used and an unknown
        # field is reported instead of being dropped silently.
        ordered_fields_list += [name for name in dict.fromkeys(field_names)
                                if name not in preferred_search_order]
        for search_field in ordered_fields_list:
            if search_field in self.known_strategies:
                self.requested_fields[search_field] = required
            else:
                print(f"don't know how to find {kind} field: '{search_field}'")

    # Tries to match requested fields with known strategies for finding those fields.
    # param: cols - a list of data strings from an arbitrary but specific record.
    # return: Nothing, but fields_collected and fields_histogram are updated
    #         for every field found in the record.
    def _findRequestedData_(self, cols: list):
        for search_field in self.requested_fields:
            whichColumnIndex = self._findDataIndex_(search_field, self.known_strategies[search_field], cols)
            if whichColumnIndex is not None:
                self.fields_collected[search_field] = whichColumnIndex
                self.fields_histogram[search_field] += 1

    # Finds the column of a record that holds the named field.
    # param: field - string name of the field to try and identify, like 'pcode'.
    # param: strategy - regular expression (compiled or string), or a callable
    #        taking (data, field), used to recognise the requested data.
    # param: data - list of strings from a given record.
    # return: int - Column position where the data matches, or None if the search was unsuccessful.
    def _findDataIndex_(self, field: str, strategy, data: list):
        if callable(strategy):
            return strategy(data, field)
        # Not callable, so a regular expression: the first matching column wins.
        for position, d in enumerate(data):
            if re.match(strategy, d.strip()):
                return position
        return None

    # Report if the input data was well formed, that is, are the requested columns
    # present and well represented in the supplied data.
    # param: none.
    # return: True if all the required requested_fields are present and at least 'n'% of
    # records are populated with valid data, and False otherwise. Optional
    # fields never cause a failure.
    def isWellFormed(self):
        required_fields = [name for name, is_required in self.requested_fields.items() if is_required]
        if required_fields and self.total_records < 1:
            print(f"no records read")
            return False
        missing_fields = [name for name in required_fields if name not in self.fields_collected]
        if missing_fields:
            print(f"failed to find the following required fields: {missing_fields}")
            return False
        # Report which fields failed to measure up.
        errors = 0
        for field in required_fields:
            # .get() so that looking up a count never adds a key to the histogram.
            frequency = self.fields_histogram.get(field, 0) / self.total_records
            if frequency < self.success_threshold / 100:
                print("'{0}' requires {1}% valid values but found only {2:.3g}%.".format(field,self.success_threshold,(frequency * 100.0)))
                errors += 1
        return errors == 0

    # Reads (once) and returns the corpus for a field as a set of title-cased entries.
    # param: field - name of the field whose corpus is wanted; the file name
    #        is looked up in the module-level corpus_dict.
    # return: set of corpus entries, or None if no corpus is registered for the field.
    def _load_corpus_(self, field: str):
        if field in self._corpus_cache:
            return self._corpus_cache[field]
        corpus_to_read = corpus_dict.get(field)
        if corpus_to_read is None:
            print(f"don't know how to read a corpus for '{field}'")
            corpus_set = None
        else:
            with open(corpus_to_read) as f:
                corpus = [line.strip().title() for line in f]
            # A blank line would put '' in the set, and '' is produced by
            # splitting any field that starts or ends with punctuation.
            corpus = [entry for entry in corpus if entry]
            print(f"first 5 entries in the corpus: {corpus[0:5]}")
            corpus_set = set(corpus)
        self._corpus_cache[field] = corpus_set
        return corpus_set

    # A fast way to tell if an enormous corpus has words that can be found in an arbitrary
    # but specific field is to convert the field to a set of words, then find the intersection
    # with the set of the corpus. If the intersection is not empty a word in the corpus
    # matched a word in the field.
    # param: data - list of strings from a given record.
    # param: field - name of the field being searched for, like 'city'.
    # return: int - position of the first column, not already assigned to a
    #         different field, that shares an entry with the corpus, or None.
    def _corpus_compare_(self, data: list, field: str):
        corpus_set = self._load_corpus_(field)
        if not corpus_set:
            return None
        # Only columns claimed by OTHER fields are off limits; the column this
        # field matched on an earlier record must stay eligible.
        claimed = {idx for name, idx in self.fields_collected.items() if name != field}
        for idx, d in enumerate(data):
            if idx in claimed:
                continue
            # Title-case the words the same way the corpus entries are, and
            # also try the whole field so multi-word entries can match.
            candidates = {word.title() for word in re.split(r'\W+', d) if word}
            candidates.add(d.strip().title())
            candidates.discard('')
            if not corpus_set.isdisjoint(candidates):
                return idx
        return None

    # return: histogram of each field, that is, the number of records in
    # which each field was found.
    def histogram(self):
        return self.fields_histogram

    # return: dictionary of field name to the column index where it was found.
    def getColumnDefinitions(self):
        return self.fields_collected

    # return: one line per field found: its column index, name and hit count.
    def __str__(self):
        return "\n".join(
            f"index: {self.fields_collected.get(k)}, {k} -> {v}"
            for k, v in self.fields_histogram.items()
        )

# Run the discovery over the single sample record.
# To try a file instead, pass its name in place of test_data, e.g.
#   limited test dataset: Discovery("test_data_2.csv", f02, optional00, ",", 50.0)
#   full test dataset:    Discovery("test_data_2.csv", f10)
discover = Discovery(test_data, f02, optional00, delim=",", threshold=50.0)

# How many records each requested field was found in.
hit_counts = discover.histogram()
print(f"{hit_counts}")

# The sample record, column by column, for comparison with the indices below.
for position, value in enumerate(test_data.split(",")):
    print(f"{position}:{value}")

# Which column each field was found in.
column_map = discover.getColumnDefinitions()
print(f"{column_map}")

# Evaluated before printing, so any diagnostics appear above the verdict.
well_formed = discover.isWellFormed()
print(f"is the data well formed? {well_formed}!")

first 5 entries in the corpus: ['Avenue', 'Crescent', 'Close', 'Circus', 'Dene']
first 5 entries in the corpus: ['Athabasca', 'Banff', 'Barrhead', 'Bashaw', 'Bassano']
first 5 entries in the corpus: ['Aab', 'Aabak', 'Aabaslama', 'Aabdulrhman', 'Aabobe']
first 5 entries in the corpus: ['Aaada', 'Aaaron', 'Aaayu', 'Aabel', 'Aabhari']
defaultdict(<class 'int'>, {'pcode': 1, 'email': 1, 'phone': 1, 'province': 1, 'street': 1, 'city': 1, 'lname': 1, 'fname': 1, 'gender': 1})
0:1234-567 Ave.
1: Edmonton
2: AB.
3: T6G 0G4 
4:Bisland-Jones
5:example@company1.com
6:Harold
7:Customer note
8:(780)-555-1212
9:Male
defaultdict(<class 'int'>, {'pcode': 3, 'email': 5, 'phone': 8, 'province': 2, 'street': 0, 'city': 1, 'lname': 4, 'fname': 6, 'gender': 9})
is the data well formed? True
